Skip to content

Commit 84adb65

Browse files
authored
Merge branch 'main' into Feature/BackFiller
2 parents 642328c + 07fe5a1 commit 84adb65

24 files changed

Lines changed: 4176 additions & 2058 deletions

File tree

k8s/welearn-datastack/templates/urlcollectors/cron-workflow.yaml

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -393,6 +393,10 @@ spec:
393393
- name: corpus_fix
394394
value: "true"
395395

396+
- name: collect-world-bank-okr
397+
templateRef:
398+
name: {{ .name }}
399+
template: collect-world-bank-okr
396400

397401
{{- end }}
398402
{{- end }}

k8s/welearn-datastack/templates/urlcollectors/workflow-template.yaml

Lines changed: 35 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -529,4 +529,39 @@ spec:
529529
secretName: {{ $.Values.common.azureShare.secret.name }}
530530
shareName: {{ $.Values.common.azureShare.name }}
531531

532+
533+
- name: collect-world-bank-okr
534+
synchronization:
535+
semaphores:
536+
- configMapKeyRef:
537+
name: {{ .collectorSemaphore.configmapName }}
538+
key: {{ .collectorSemaphore.standard.keyName }}
539+
container:
540+
{{- with $.Values.image }}
541+
image: {{ include "common.pods.image" (dict "root" $ "image" (dict "repository" .repository "path" .path "tag" .tag))}}
542+
{{- end }}
543+
args:
544+
- python
545+
- "-m"
546+
- welearn_datastack.nodes_workflow.URLCollectors.node_world_bank_okr_collect
547+
envFrom:
548+
- configMapRef:
549+
name: {{ .name }}
550+
volumeMounts:
551+
- name: secrets
552+
mountPath: "/secrets"
553+
readOnly: true
554+
- name: azure-share
555+
mountPath: {{ $.Values.common.azureShare.mountPath }}
556+
volumes:
557+
- name: secrets
558+
secret:
559+
secretName: {{ .name }}
560+
- name: azure-share
561+
csi:
562+
driver: file.csi.azure.com
563+
readOnly: true
564+
volumeAttributes:
565+
secretName: {{ $.Values.common.azureShare.secret.name }}
566+
shareName: {{ $.Values.common.azureShare.name }}
532567
{{- end }}

poetry.lock

Lines changed: 1862 additions & 2020 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 14 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -23,42 +23,41 @@ package-mode = false
2323

2424
[tool.poetry.dependencies]
2525
python = ">=3.12,<3.13"
26-
sqlalchemy = "^2.0.46"
27-
numpy = "^2.4.2"
28-
requests = "^2.33.0"
29-
wikipedia-api = "^0.9.0"
26+
sqlalchemy = "^2.0.50"
27+
numpy = "^2.4.6"
28+
requests = "^2.34.2"
29+
wikipedia-api = "^0.15.0"
3030
sentence-transformers = "^5.2.2"
3131
spacy = "^3.8.11"
3232
refinedoc = "^1.0.0"
33-
qdrant-client = "1.16.2"
33+
qdrant-client = "1.18.0"
3434
python-dotenv = "^1.2.2"
3535
beautifulsoup4 = "^4.14.3"
3636
pyphen = "^0.17.2"
37-
ijson = "^3.4.0"
3837
keybert = "^0.9.0"
39-
torch = {version="^2.10.0+cpu", source = "pytorch_cpu"}
40-
torchvision = {version="^0.25.0+cpu", source = "pytorch_cpu"}
38+
torch = {version="^2.12.0+cpu", source = "pytorch_cpu"}
39+
torchvision = {version="^0.27.0+cpu", source = "pytorch_cpu"}
4140
xx_sent_ud_sm = {url = "https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.8.0/xx_sent_ud_sm-3.8.0-py3-none-any.whl"}
4241

43-
lingua-language-detector = "^2.1.1"
44-
psycopg2-binary = "^2.9.10"
42+
lingua-language-detector = "^2.2.0"
43+
psycopg2-binary = "^2.9.12"
4544
brotli = "^1.2.0"
4645
scikit-learn = "~=1.7.0"
4746
optimum = {extras = ["onnxruntime"], version = "^2.1.0"}
4847
azure-storage-blob = "^12.28.0"
4948
welearn-database = "^1.4.4"
5049
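pydantic = "^2.12.5"  # NOTE(review): duplicate key; pydantic is defined again on the next added line ("^2.13.4"), which is invalid TOML. Keep only one entry.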
50+
pydantic = "^2.13.4"
5151

5252
[tool.poetry.group.metrics.dependencies]
53-
alembic = "^1.18.3"
5453
locust = "^2.43.2"
5554

5655

5756
[tool.poetry.group.dev.dependencies]
58-
mypy = "^1.19.1"
59-
bandit = "^1.9.3"
60-
isort = "^7.0.0"
61-
black = "^26.3.1"
57+
mypy = "^2.1.0"
58+
bandit = "^1.9.4"
59+
isort = "^8.0.1"
60+
black = "^26.5.1"
6261

6362
[build-system]
6463
requires = ["poetry-core"]

tests/document_collector_hub/plugins_test/test_unesdoc.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -3,9 +3,8 @@
33
from unittest import TestCase
44
from unittest.mock import Mock, patch
55

6-
import pydantic
76
import requests
8-
from requests import HTTPError, RequestException, Response
7+
from requests import HTTPError
98
from welearn_database.data.models import WeLearnDocument
109

1110
from welearn_datastack.data.source_models.unesdoc import UNESDOCItem
@@ -113,7 +112,6 @@ def test__extract_authors(self):
113112
def test__check_licence_authorization_good(self):
114113
tested_licence = "https://creativecommons.org/licenses/by-sa/3.0/igo/"
115114
self.collector._check_licence_authorization(tested_licence)
116-
self.assertTrue(True)
117115

118116
def test__check_licence_authorization_bad(self):
119117
tested_licence = (

0 commit comments

Comments
 (0)