Skip to content

Commit fd6556b

Browse files
authored
Merge pull request #9 from CyberCRI/Feature/usc-wf
Feature/usc wf
2 parents 0b3cfba + d77ab1e commit fd6556b

File tree

6 files changed

+366
-3
lines changed

6 files changed

+366
-3
lines changed
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{{- with .Values.urlSanitaryCrawler }}
2+
{{- $config := dict}}
3+
{{- $_ := merge $config $.Values.common.config}}
4+
{{- $_ := merge $config .config}}
5+
6+
{{- $commonLabels := dict }}
7+
{{- $_ := merge $commonLabels $.Values.common.commonLabels}}
8+
{{- $_ := merge $commonLabels .commonLabels}}
9+
10+
{{- $configGenerator := dict
11+
"config" $config
12+
"root" $
13+
"commonLabels" $commonLabels
14+
"name" .name }}
15+
16+
{{- include "common.config.resources" $configGenerator}}
17+
{{- end }}
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
{{- with .Values.urlSanitaryCrawler }}
2+
{{- if and (.cronWorkflowTemplate.standard.enabled) ($.Values.cronWorkflows.enabled) }}
3+
kind: CronWorkflow
4+
apiVersion: argoproj.io/v1alpha1
5+
metadata:
6+
name: {{ .name }}
7+
spec:
8+
workflowSpec:
9+
{{- if $.Values.common.runOnGpu }}
10+
affinity:
11+
nodeAffinity:
12+
preferredDuringSchedulingIgnoredDuringExecution:
13+
- weight: 1
14+
preference:
15+
matchExpressions:
16+
- key: gpu-node
17+
operator: In
18+
values:
19+
- "true"
20+
tolerations:
21+
- key: "Gpu"
22+
operator: "Equal"
23+
value: "true"
24+
effect: "NoSchedule"
25+
{{- end }}
26+
securityContext:
27+
{{- include "common.security.podSecurityContext.restricted" dict | nindent 6 }}
28+
templates:
29+
- name: all
30+
inputs:
31+
parameters:
32+
- name: csv_path
33+
default: /etc/welearndatastack/collectors
34+
- name: parallelism_treshold
35+
default: '500'
36+
- name: parallelism_max
37+
default: '15'
38+
- name: retrieval_mode
39+
enum: ["NEW_MODE", "UPDATE_MODE"]
40+
default: "NEW_MODE"
41+
- name: pick_qty_max
42+
default: '1000'
43+
- name: log_level
44+
default: "INFO"
45+
- name: batch_urls_directory
46+
default: batch_urls
47+
- name: memory_batch_urls
48+
default: {{ .workflowTemplates.prepareBatches.requests.memory }}
49+
- name: batch_urls_ids_file_name
50+
default: "batch_ids.csv"
51+
- name: memory_collect_docs
52+
default: {{ .workflowTemplates.runNode.requests.memory }}
53+
- name: size_limit
54+
default: 10000000000 # In bytes
55+
steps:
56+
- - name: generate-to-sanitize-batch
57+
templateRef:
58+
name: {{ .name }}-prepare-batches
59+
template: generate-to-sanitize-batch
60+
arguments:
61+
parameters:
62+
- name: log_level
63+
value: >-
64+
{{ print "{{inputs.parameters.log_level}}" }}
65+
- name: parallelism_treshold
66+
value: >-
67+
{{ print "{{inputs.parameters.parallelism_treshold}}" }}
68+
- name: parallelism_max
69+
value: >-
70+
{{ print "{{inputs.parameters.parallelism_max}}" }}
71+
- name: batch_urls_directory
72+
value: >-
73+
{{ print "{{inputs.parameters.batch_urls_directory}}" }}
74+
- name: retrieval_mode
75+
value: >-
76+
{{ print "{{inputs.parameters.retrieval_mode}}" }}
77+
- name: memory
78+
value: >-
79+
{{ print "{{inputs.parameters.memory_batch_urls}}" }}
80+
- name: batch_urls_count_file_name
81+
value: "quantity.txt"
82+
- name: pick_qty_max
83+
value: >-
84+
{{ print "{{inputs.parameters.pick_qty_max}}" }}
85+
- name: output_filename
86+
value: >-
87+
{{ print "{{inputs.parameters.batch_urls_ids_file_name}}" }}
88+
- name: size_limit
89+
value: >-
90+
{{ print "{{inputs.parameters.size_limit}}" }}
91+
- - name: url-sanitary-crawler
92+
templateRef:
93+
name: {{ .name }}-run-node
94+
template: url-sanitary-crawler
95+
arguments:
96+
parameters:
97+
- name: log_level
98+
value: >-
99+
{{ print "{{inputs.parameters.log_level}}" }}
100+
- name: urls_input_filename
101+
value: >-
102+
{{ print "{{item}}_{{inputs.parameters.batch_urls_ids_file_name}}" }}
103+
- name: urls_directory
104+
value: >-
105+
{{ print "{{inputs.parameters.batch_urls_directory}}" }}
106+
default: urls
107+
- name: memory
108+
value: >-
109+
{{ print "{{inputs.parameters.memory_collect_docs}}" }}
110+
artifacts:
111+
- name: batch_ids_csv
112+
from: >-
113+
{{ print "{{steps.generate-to-sanitize-batch.outputs.artifacts.batch_ids_csv}}" }}
114+
withSequence:
115+
count: >-
116+
{{ print "{{steps.generate-to-sanitize-batch.outputs.parameters.url_batches_count}}" }}
117+
118+
entrypoint: all
119+
serviceAccountName: {{ $.Values.common.workflowRbac.serviceAccountName }}
120+
schedules: [{{ .cronWorkflowTemplate.standard.schedule }}]
121+
{{- end }}
122+
{{- end }}
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
{{- with .Values.urlSanitaryCrawler }}
2+
apiVersion: argoproj.io/v1alpha1
3+
kind: WorkflowTemplate
4+
metadata:
5+
name: {{ .name }}-prepare-batches
6+
spec:
7+
entrypoint: all
8+
serviceAccountName: {{ $.Values.common.workflowRbac.serviceAccountName }}
9+
securityContext:
10+
{{- include "common.security.podSecurityContext.restricted" dict | nindent 4 }}
11+
{{- if $.Values.common.runOnGpu }}
12+
affinity:
13+
nodeAffinity:
14+
preferredDuringSchedulingIgnoredDuringExecution:
15+
- weight: 1
16+
preference:
17+
matchExpressions:
18+
- key: gpu-node
19+
operator: In
20+
values:
21+
- "true"
22+
tolerations:
23+
- key: "Gpu"
24+
operator: "Equal"
25+
value: "true"
26+
effect: "NoSchedule"
27+
{{- end }}
28+
templates:
29+
- name: generate-to-sanitize-batch
30+
podSpecPatch: >-
31+
{{ print "{\"containers\":[{\"name\":\"main\", \"resources\":{\"limits\": {\"memory\": \"{{inputs.parameters.memory}}\"}, \"requests\": {\"memory\": \"{{inputs.parameters.memory}}\"}}}]}" }}
32+
inputs:
33+
parameters:
34+
- name: parallelism_treshold
35+
default: 100000
36+
- name: parallelism_max
37+
default: 15
38+
- name: batch_urls_directory
39+
default: batch_urls
40+
- name: retrieval_mode
41+
enum: ["NEW_MODE", "UPDATE_MODE"]
42+
default: "NEW_MODE"
43+
- name: pick_qty_max
44+
default: 100000
45+
- name: memory
46+
default: {{ .workflowTemplates.prepareBatches.requests.memory }}
47+
- name: batch_urls_count_file_name
48+
default: "quantity.txt"
49+
- name: output_filename
50+
default: "batch_ids.csv"
51+
- name: log_level
52+
default: "INFO"
53+
- name: size_limit
54+
default: 10000000000
55+
outputs:
56+
artifacts:
57+
- name: batch_ids_csv
58+
path: >-
59+
{{ printf "%s/%s/{{inputs.parameters.batch_urls_directory}}" $.Values.common.artifactRoot $.Values.common.artifactOutputFolder }}
60+
parameters:
61+
- name: url_batches_count
62+
valueFrom:
63+
path: >-
64+
{{ printf "%s/%s/{{inputs.parameters.batch_urls_directory}}/{{inputs.parameters.batch_urls_count_file_name}}" $.Values.common.artifactRoot $.Values.common.artifactOutputFolder }}
65+
container:
66+
{{- with $.Values.image }}
67+
image: {{ tpl (printf "%s/%s:%s" .repository .path .tag) $ }}
68+
{{- end }}
69+
envFrom:
70+
- configMapRef:
71+
name: {{ .name }}
72+
env:
73+
- name: ARTIFACT_ROOT
74+
value: {{ $.Values.common.artifactRoot }}
75+
- name: IS_LOCAL
76+
value: "0"
77+
- name: PARALLELISM_THRESHOLD
78+
value: >-
79+
{{ print "{{inputs.parameters.parallelism_treshold}}" }}
80+
- name: PARALLELISM_URL_MAX
81+
value: >-
82+
{{ print "{{inputs.parameters.parallelism_max}}" }}
83+
- name: BATCH_URLS_DIRECTORY
84+
value: >-
85+
{{ print "{{inputs.parameters.batch_urls_directory}}" }}
86+
- name: RETRIEVAL_MODE
87+
value: >-
88+
{{ print "{{inputs.parameters.retrieval_mode }}" }}
89+
- name: PICK_QTY_MAX
90+
value: >-
91+
{{ print "{{inputs.parameters.pick_qty_max }}" }}
92+
- name: LOG_LEVEL
93+
value: >-
94+
{{ print "{{inputs.parameters.log_level}}" }}
95+
- name: OUTPUT_FILE_NAME
96+
value: >-
97+
{{ print "{{inputs.parameters.output_filename}}" }}
98+
- name: SIZE_TOTAL_LIMIT
99+
value: >-
100+
{{ print "{{inputs.parameters.size_limit}}" }}
101+
args:
102+
- python
103+
- "-m"
104+
- welearn_datastack.nodes_workflow.URLSanitaryCrawler.generate_to_sanitize_batch
105+
106+
volumeMounts:
107+
- name: secrets
108+
mountPath: "/secrets"
109+
readOnly: true
110+
111+
volumes:
112+
- name: secrets
113+
secret:
114+
secretName: {{ .name }}
115+
- name: azure-share
116+
csi:
117+
driver: file.csi.azure.com
118+
readOnly: true
119+
volumeAttributes:
120+
secretName: {{ $.Values.common.azureShare.secret.name }}
121+
shareName: {{ $.Values.common.azureShare.name }}
122+
{{- end }}
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
{{- with .Values.urlSanitaryCrawler }}
2+
apiVersion: argoproj.io/v1alpha1
3+
kind: WorkflowTemplate
4+
metadata:
5+
name: {{ .name }}-run-node
6+
spec:
7+
entrypoint: all
8+
serviceAccountName: {{ $.Values.common.workflowRbac.serviceAccountName }}
9+
securityContext:
10+
{{- include "common.security.podSecurityContext.restricted" dict | nindent 4 }}
11+
{{- if $.Values.common.runOnGpu }}
12+
affinity:
13+
nodeAffinity:
14+
preferredDuringSchedulingIgnoredDuringExecution:
15+
- weight: 1
16+
preference:
17+
matchExpressions:
18+
- key: gpu-node
19+
operator: In
20+
values:
21+
- "true"
22+
tolerations:
23+
- key: "Gpu"
24+
operator: "Equal"
25+
value: "true"
26+
effect: "NoSchedule"
27+
{{- end }}
28+
templates:
29+
30+
- name: url-sanitary-crawler
31+
podSpecPatch: >-
32+
{{ print "{\"containers\":[{\"name\":\"main\", \"resources\":{\"requests\": {\"memory\": \"{{inputs.parameters.memory}}\"}}}]}" }}
33+
inputs:
34+
parameters:
35+
- name: log_level
36+
default: "INFO"
37+
- name: urls_input_filename
38+
- name: urls_directory
39+
default: urls
40+
- name: memory
41+
default: {{ .workflowTemplates.runNode.requests.memory }}
42+
- name: size_limit
43+
default: 10000000000
44+
artifacts:
45+
- name: batch_ids_csv
46+
path: >-
47+
{{ printf "%s/%s/{{inputs.parameters.urls_directory}}" $.Values.common.artifactRoot $.Values.common.artifactInputFolder }}
48+
container:
49+
{{- with $.Values.image }}
50+
image: {{ tpl (printf "%s/%s:%s" .repository .path .tag) $ }}
51+
{{- end }}
52+
envFrom:
53+
- configMapRef:
54+
name: {{ .name }}
55+
env:
56+
- name: ARTIFACT_ROOT
57+
value: {{ $.Values.common.artifactRoot }}
58+
- name: IS_LOCAL
59+
value: "0"
60+
- name: LOG_LEVEL
61+
value: >-
62+
{{ print "{{inputs.parameters.log_level}}" }}
63+
- name: ARTIFACT_ID_URL_CSV_NAME
64+
value: >-
65+
{{ print "{{inputs.parameters.urls_directory}}/{{inputs.parameters.urls_input_filename}}" }}
66+
67+
args:
68+
- python
69+
- "-m"
70+
- welearn_datastack.nodes_workflow.URLSanitaryCrawler.url_sanitary_crawler
71+
volumeMounts:
72+
- name: secrets
73+
mountPath: "/secrets"
74+
readOnly: true
75+
volumes:
76+
- name: secrets
77+
secret:
78+
secretName: {{ .name }}
79+
{{- end }}

k8s/welearn-datastack/values.yaml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,32 @@ wikipediaUpdater:
227227
schedule: 40 */1 * * *
228228

229229

230+
urlSanitaryCrawler:
231+
name: url-sanitary-crawler
232+
config:
233+
sensitive: {}
234+
nonSensitive: {}
235+
236+
resourcesPrefix: url-sanitary-crawler
237+
238+
commonLabels:
239+
app.kubernetes.io/component: url-sanitary-crawler
240+
241+
workflowTemplates:
242+
prepareBatches:
243+
requests:
244+
memory: 75M
245+
runNode:
246+
requests:
247+
memory: 20M
248+
249+
cronWorkflowTemplate:
250+
standard:
251+
enabled: true
252+
schedule: 40 */1 * * *
253+
254+
255+
230256
qdrantSyncronizer:
231257
name: qdrant-synchronizer
232258
config:

welearn_datastack/nodes_workflow/URLSanitaryCrawler/url_sanitary_crawler.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,6 @@ def main() -> None:
5959
)
6060
logger.info("'%s' WeLearnDocuments were retrieved", len(welearn_documents))
6161

62-
wlds_ids_to_update: List[Tuple[UUID, int]] = []
63-
wlds_ids_to_delete: List[Tuple[UUID, int]] = []
64-
6562
# Check url
6663
logger.info("Check URL state")
6764

0 commit comments

Comments
 (0)