update for new qdrant naming convention

lpi-tn · lpi-tn · commit e00760560fdb · 2025-04-25T11:58:52.000+02:00
diff --git a/README.md b/README.md
@@ -101,9 +101,10 @@ alembic upgrade head
 Assuming you want to ingest corpus "wikipedia" and "PLOS" in english and french for the first one and only in english for the second one.
 You're using **all-minilm-l6-v2** vectorizer for english and **sentence-camembert-base** for the french.
 You need to create the following collections in qdrant:
-- collection_wikipedia_en_all-minilm-l6-v2_v0
-- collection_wikipedia_fr_sentence-camembert-base_v0
-- collection_plos_en_all-minilm-l6-v2_v0
+- collection_welearn_en_all-minilm-l6-v2_v0
+- collection_welearn_fr_sentence-camembert-base_v0
+
+To retrieve data from a specific corpus, filter on the `document_corpus` payload field, as described in the [qdrant documentation](https://qdrant.tech/documentation/guides/multiple-partitions/).
 
 Command for [create collection](https://qdrant.tech/documentation/concepts/collections/#create-a-collection) :
 ```
diff --git a/tests/qdrant_syncronizer/test_qdrant_handler.py b/tests/qdrant_syncronizer/test_qdrant_handler.py
@@ -59,14 +59,14 @@ def setUp(self):
         self.client = QdrantClient(":memory:")
 
         self.client.create_collection(
-            collection_name="collection_en_embmodel",
+            collection_name="collection_welearn_en_embmodel",
             vectors_config=models.VectorParams(
                 size=50, distance=models.Distance.COSINE
             ),
         )
 
         self.client.create_collection(
-            collection_name="collection_fr_embmodel",
+            collection_name="collection_welearn_fr_embmodel",
             vectors_config=models.VectorParams(
                 size=50, distance=models.Distance.COSINE
             ),
@@ -83,7 +83,7 @@ def test_should_get_collections_names_for_given_slices(self):
         slices = [fake_slice]
         collections_names = classify_documents_per_collection(qdrant_connector, slices)
 
-        expected = {"collection_en_embmodel": {fake_slice.document_id}}
+        expected = {"collection_welearn_en_embmodel": {fake_slice.document_id}}
         self.assertEqual(collections_names, expected)
 
     def test_should_handle_multiple_slices_for_same_collection(self):
@@ -101,7 +101,7 @@ def test_should_handle_multiple_slices_for_same_collection(self):
         slices = [fake_slice0, fake_slice1, fake_slice2]
         collections_names = classify_documents_per_collection(qdrant_connector, slices)
         expected = {
-            "collection_en_embmodel": {doc_id0},
-            "collection_fr_embmodel": {doc_id1},
+            "collection_welearn_en_embmodel": {doc_id0},
+            "collection_welearn_fr_embmodel": {doc_id1},
         }
         self.assertEqual(collections_names, expected)
diff --git a/tests/qdrant_syncronizer/test_qdrant_syncronizer.py b/tests/qdrant_syncronizer/test_qdrant_syncronizer.py
@@ -34,12 +34,12 @@ def setUp(self):
         self.client = QdrantClient(":memory:")
 
         self.client.create_collection(
-            collection_name="collection_en_embmodel",
+            collection_name="collection_welearn_en_embmodel",
             vectors_config=models.VectorParams(size=5, distance=models.Distance.COSINE),
         )
 
         self.client.create_collection(
-            collection_name="collection_fr_embmodel",
+            collection_name="collection_welearn_fr_embmodel",
             vectors_config=models.VectorParams(size=5, distance=models.Distance.COSINE),
         )
 
@@ -196,7 +196,7 @@ def test_qdrant_syncronizer(self, mock_create_db_session, mock_qdrant_client):
         self.assertEqual(Step.DOCUMENT_IN_QDRANT.value, most_recent_state.title)
 
         ret_values_from_qdrant = self.client.scroll(
-            collection_name=f"collection_en_embmodel",
+            collection_name=f"collection_welearn_en_embmodel",
             limit=100,
             with_vectors=True,
         )
diff --git a/welearn_datastack/modules/qdrant_handler.py b/welearn_datastack/modules/qdrant_handler.py
@@ -38,7 +38,7 @@ def classify_documents_per_collection(
     for dslice in slices:
         lang = dslice.document.lang
         model = dslice.embedding_model_name
-        collection_name = f"collection_{lang}_{model.lower()}"
+        collection_name = f"collection_welearn_{lang}_{model.lower()}"
 
         if collection_name not in collections_names_in_qdrant:
             logger.error(
diff --git a/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py b/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py
@@ -94,7 +94,7 @@ def main() -> None:
     for i, chunk in enumerate(qdrant_chunk):
         logger.info("Processing chunk: #%s", i)
         slices: Sequence[Type[DocumentSlice]] = (
-            db_session.query(DocumentSlice)
+            db_session.query(DocumentSlice)  # type: ignore
             .filter(DocumentSlice.document_id.in_(chunk))
             .all()
         )
@@ -144,7 +144,7 @@ def main() -> None:
                     document_slices = slices_per_doc[docid]
                     slices_sdgs = retrieve_slices_sdgs(db_session, document_slices)
                     all_document_sdgs = [
-                        slices_sdgs[s.id]
+                        slices_sdgs[s.id]  # type: ignore
                         for s in document_slices
                         if s.id in slices_sdgs
                     ]
@@ -158,7 +158,7 @@ def main() -> None:
                                 convert_slice_in_qdrant_point(
                                     slice_to_convert=doc_slice,
                                     document_sdgs=accurate_sdgs,
-                                    slice_sdg=slices_sdgs[doc_slice.id],
+                                    slice_sdg=slices_sdgs[doc_slice.id],  # type: ignore
                                 )
                             )
 

Original file line number	Diff line number	Diff line change
`@@ -34,12 +34,12 @@ def setUp(self):`
`34`	`34`	`self.client = QdrantClient(":memory:")`
`35`	`35`
`36`	`36`	`self.client.create_collection(`
`37`		`- collection_name="collection_en_embmodel",`
	`37`	`+ collection_name="collection_welearn_en_embmodel",`
`38`	`38`	`vectors_config=models.VectorParams(size=5, distance=models.Distance.COSINE),`
`39`	`39`	`)`
`40`	`40`
`41`	`41`	`self.client.create_collection(`
`42`		`- collection_name="collection_fr_embmodel",`
	`42`	`+ collection_name="collection_welearn_fr_embmodel",`
`43`	`43`	`vectors_config=models.VectorParams(size=5, distance=models.Distance.COSINE),`
`44`	`44`	`)`
`45`	`45`
`@@ -196,7 +196,7 @@ def test_qdrant_syncronizer(self, mock_create_db_session, mock_qdrant_client):`
`196`	`196`	`self.assertEqual(Step.DOCUMENT_IN_QDRANT.value, most_recent_state.title)`
`197`	`197`
`198`	`198`	`ret_values_from_qdrant = self.client.scroll(`
`199`		`- collection_name=f"collection_en_embmodel",`
	`199`	`+ collection_name=f"collection_welearn_en_embmodel",`
`200`	`200`	`limit=100,`
`201`	`201`	`with_vectors=True,`
`202`	`202`	`)`
Original file line number	Diff line number	Diff line change
`@@ -94,7 +94,7 @@ def main() -> None:`
`94`	`94`	`for i, chunk in enumerate(qdrant_chunk):`
`95`	`95`	`logger.info("Processing chunk: #%s", i)`
`96`	`96`	`slices: Sequence[Type[DocumentSlice]] = (`
`97`		`- db_session.query(DocumentSlice)`
	`97`	`+ db_session.query(DocumentSlice) # type: ignore`
`98`	`98`	`.filter(DocumentSlice.document_id.in_(chunk))`
`99`	`99`	`.all()`
`100`	`100`	`)`
`@@ -144,7 +144,7 @@ def main() -> None:`
`144`	`144`	`document_slices = slices_per_doc[docid]`
`145`	`145`	`slices_sdgs = retrieve_slices_sdgs(db_session, document_slices)`
`146`	`146`	`all_document_sdgs = [`
`147`		`- slices_sdgs[s.id]`
	`147`	`+ slices_sdgs[s.id] # type: ignore`
`148`	`148`	`for s in document_slices`
`149`	`149`	`if s.id in slices_sdgs`
`150`	`150`	`]`
`@@ -158,7 +158,7 @@ def main() -> None:`
`158`	`158`	`convert_slice_in_qdrant_point(`
`159`	`159`	`slice_to_convert=doc_slice,`
`160`	`160`	`document_sdgs=accurate_sdgs,`
`161`		`- slice_sdg=slices_sdgs[doc_slice.id],`
	`161`	`+ slice_sdg=slices_sdgs[doc_slice.id], # type: ignore`
`162`	`162`	`)`
`163`	`163`	`)`
`164`	`164`