Skip to content

Commit e007605

Browse files
committed
update for new qdrant naming convention
1 parent 00d9ee8 commit e007605

5 files changed

Lines changed: 16 additions & 15 deletions

File tree

README.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,10 @@ alembic upgrade head
101101
Assume you want to ingest the corpora "wikipedia" and "PLOS": the first one in English and French, the second one in English only.
102102
You're using the **all-minilm-l6-v2** vectorizer for English and the **sentence-camembert-base** vectorizer for French.
103103
You need to create the following collections in qdrant:
104-
- collection_wikipedia_en_all-minilm-l6-v2_v0
105-
- collection_wikipedia_fr_sentence-camembert-base_v0
106-
- collection_plos_en_all-minilm-l6-v2_v0
104+
- collection_welearn_en_all-minilm-l6-v2_v0
105+
- collection_welearn_fr_sentence-camembert-base_v0
106+
107+
To retrieve data from a specific corpus, you must use the `document_corpus` payload field, as described in the [qdrant documentation](https://qdrant.tech/documentation/guides/multiple-partitions/).
107108

108109
Command to [create a collection](https://qdrant.tech/documentation/concepts/collections/#create-a-collection):
109110
```

tests/qdrant_syncronizer/test_qdrant_handler.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,14 +59,14 @@ def setUp(self):
5959
self.client = QdrantClient(":memory:")
6060

6161
self.client.create_collection(
62-
collection_name="collection_en_embmodel",
62+
collection_name="collection_welearn_en_embmodel",
6363
vectors_config=models.VectorParams(
6464
size=50, distance=models.Distance.COSINE
6565
),
6666
)
6767

6868
self.client.create_collection(
69-
collection_name="collection_fr_embmodel",
69+
collection_name="collection_welearn_fr_embmodel",
7070
vectors_config=models.VectorParams(
7171
size=50, distance=models.Distance.COSINE
7272
),
@@ -83,7 +83,7 @@ def test_should_get_collections_names_for_given_slices(self):
8383
slices = [fake_slice]
8484
collections_names = classify_documents_per_collection(qdrant_connector, slices)
8585

86-
expected = {"collection_en_embmodel": {fake_slice.document_id}}
86+
expected = {"collection_welearn_en_embmodel": {fake_slice.document_id}}
8787
self.assertEqual(collections_names, expected)
8888

8989
def test_should_handle_multiple_slices_for_same_collection(self):
@@ -101,7 +101,7 @@ def test_should_handle_multiple_slices_for_same_collection(self):
101101
slices = [fake_slice0, fake_slice1, fake_slice2]
102102
collections_names = classify_documents_per_collection(qdrant_connector, slices)
103103
expected = {
104-
"collection_en_embmodel": {doc_id0},
105-
"collection_fr_embmodel": {doc_id1},
104+
"collection_welearn_en_embmodel": {doc_id0},
105+
"collection_welearn_fr_embmodel": {doc_id1},
106106
}
107107
self.assertEqual(collections_names, expected)

tests/qdrant_syncronizer/test_qdrant_syncronizer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,12 @@ def setUp(self):
3434
self.client = QdrantClient(":memory:")
3535

3636
self.client.create_collection(
37-
collection_name="collection_en_embmodel",
37+
collection_name="collection_welearn_en_embmodel",
3838
vectors_config=models.VectorParams(size=5, distance=models.Distance.COSINE),
3939
)
4040

4141
self.client.create_collection(
42-
collection_name="collection_fr_embmodel",
42+
collection_name="collection_welearn_fr_embmodel",
4343
vectors_config=models.VectorParams(size=5, distance=models.Distance.COSINE),
4444
)
4545

@@ -196,7 +196,7 @@ def test_qdrant_syncronizer(self, mock_create_db_session, mock_qdrant_client):
196196
self.assertEqual(Step.DOCUMENT_IN_QDRANT.value, most_recent_state.title)
197197

198198
ret_values_from_qdrant = self.client.scroll(
199-
collection_name=f"collection_en_embmodel",
199+
collection_name=f"collection_welearn_en_embmodel",
200200
limit=100,
201201
with_vectors=True,
202202
)

welearn_datastack/modules/qdrant_handler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def classify_documents_per_collection(
3838
for dslice in slices:
3939
lang = dslice.document.lang
4040
model = dslice.embedding_model_name
41-
collection_name = f"collection_{lang}_{model.lower()}"
41+
collection_name = f"collection_welearn_{lang}_{model.lower()}"
4242

4343
if collection_name not in collections_names_in_qdrant:
4444
logger.error(

welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ def main() -> None:
9494
for i, chunk in enumerate(qdrant_chunk):
9595
logger.info("Processing chunk: #%s", i)
9696
slices: Sequence[Type[DocumentSlice]] = (
97-
db_session.query(DocumentSlice)
97+
db_session.query(DocumentSlice) # type: ignore
9898
.filter(DocumentSlice.document_id.in_(chunk))
9999
.all()
100100
)
@@ -144,7 +144,7 @@ def main() -> None:
144144
document_slices = slices_per_doc[docid]
145145
slices_sdgs = retrieve_slices_sdgs(db_session, document_slices)
146146
all_document_sdgs = [
147-
slices_sdgs[s.id]
147+
slices_sdgs[s.id] # type: ignore
148148
for s in document_slices
149149
if s.id in slices_sdgs
150150
]
@@ -158,7 +158,7 @@ def main() -> None:
158158
convert_slice_in_qdrant_point(
159159
slice_to_convert=doc_slice,
160160
document_sdgs=accurate_sdgs,
161-
slice_sdg=slices_sdgs[doc_slice.id],
161+
slice_sdg=slices_sdgs[doc_slice.id], # type: ignore
162162
)
163163
)
164164

0 commit comments

Comments
 (0)