Skip to content

Commit 240ec9b

Browse files
authored
Merge pull request #2 from CyberCRI/Feature/wf-wikipedia-updater
Add the workflow template for wikipedia updater
2 parents 78ca495 + d69368a commit 240ec9b

File tree

4 files changed

+340
-0
lines changed

4 files changed

+340
-0
lines changed
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{{- with .Values.wikipediaUpdater }}
2+
{{- $config := dict}}
3+
{{- $_ := merge $config $.Values.common.config}}
4+
{{- $_ := merge $config .config}}
5+
6+
{{- $commonLabels := dict }}
7+
{{- $_ := merge $commonLabels $.Values.common.commonLabels}}
8+
{{- $_ := merge $commonLabels .commonLabels}}
9+
10+
{{- $configGenerator := dict
11+
"config" $config
12+
"root" $
13+
"commonLabels" $commonLabels
14+
"name" .name }}
15+
16+
{{- include "common.config.resources" $configGenerator}}
17+
{{- end }}
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
{{- with .Values.wikipediaUpdater }}
2+
{{- if and (.cronWorkflowTemplate.standard.enabled) ($.Values.cronWorkflows.enabled) }}
3+
kind: CronWorkflow
4+
apiVersion: argoproj.io/v1alpha1
5+
metadata:
6+
name: {{ .name }}
7+
spec:
8+
workflowSpec:
9+
{{- if $.Values.common.runOnGpu }}
10+
affinity:
11+
nodeAffinity:
12+
preferredDuringSchedulingIgnoredDuringExecution:
13+
- weight: 1
14+
preference:
15+
matchExpressions:
16+
- key: gpu-node
17+
operator: In
18+
values:
19+
- "true"
20+
tolerations:
21+
- key: "Gpu"
22+
operator: "Equal"
23+
value: "true"
24+
effect: "NoSchedule"
25+
{{- end }}
26+
securityContext:
27+
{{- include "common.security.podSecurityContext.restricted" dict | nindent 6 }}
28+
templates:
29+
- name: all
30+
inputs:
31+
parameters:
32+
- name: csv_path
33+
default: /etc/welearndatastack/collectors
34+
- name: parallelism_treshold
35+
default: '500'
36+
- name: parallelism_max
37+
default: '15'
38+
- name: retrieval_mode
39+
enum: ["NEW_MODE", "UPDATE_MODE"]
40+
default: "NEW_MODE"
41+
- name: pick_qty_max
42+
default: '1000'
43+
- name: log_level
44+
default: "INFO"
45+
- name: batch_urls_directory
46+
default: batch_urls
47+
- name: memory_batch_urls
48+
default: {{ .workflowTemplates.prepareBatches.requests.memory }}
49+
- name: batch_urls_ids_file_name
50+
default: "batch_ids.csv"
51+
- name: memory_collect_docs
52+
default: {{ .workflowTemplates.runNode.requests.memory }}
53+
- name: size_limit
54+
default: 10000000000 # In bytes
55+
steps:
56+
- - name: generate-wikipedia-batch
57+
templateRef:
58+
name: {{ .name }}-prepare-batches
59+
template: generate-to-extract-batch
60+
arguments:
61+
parameters:
62+
- name: log_level
63+
value: >-
64+
{{ print "{{inputs.parameters.log_level}}" }}
65+
- name: parallelism_treshold
66+
value: >-
67+
{{ print "{{inputs.parameters.parallelism_treshold}}" }}
68+
- name: parallelism_max
69+
value: >-
70+
{{ print "{{inputs.parameters.parallelism_max}}" }}
71+
- name: batch_urls_directory
72+
value: >-
73+
{{ print "{{inputs.parameters.batch_urls_directory}}" }}
74+
- name: retrieval_mode
75+
value: >-
76+
{{ print "{{inputs.parameters.retrieval_mode}}" }}
77+
- name: memory
78+
value: >-
79+
{{ print "{{inputs.parameters.memory_batch_urls}}" }}
80+
- name: batch_urls_count_file_name
81+
value: "quantity.txt"
82+
- name: pick_qty_max
83+
value: >-
84+
{{ print "{{inputs.parameters.pick_qty_max}}" }}
85+
- name: output_filename
86+
value: >-
87+
{{ print "{{inputs.parameters.batch_urls_ids_file_name}}" }}
88+
- name: size_limit
89+
value: >-
90+
{{ print "{{inputs.parameters.size_limit}}" }}
91+
- - name: wikipedia-updater
92+
templateRef:
93+
name: {{ .name }}-run-node
94+
template: wikipedia-updater
95+
arguments:
96+
parameters:
97+
- name: log_level
98+
value: >-
99+
{{ print "{{inputs.parameters.log_level}}" }}
100+
- name: urls_input_filename
101+
value: >-
102+
{{ print "{{item}}_{{inputs.parameters.batch_urls_ids_file_name}}" }}
103+
- name: urls_directory
104+
value: >-
105+
{{ print "{{inputs.parameters.batch_urls_directory}}" }}
106+
default: urls
107+
- name: memory
108+
value: >-
109+
{{ print "{{inputs.parameters.memory_collect_docs}}" }}
110+
artifacts:
111+
- name: batch_ids_csv
112+
from: >-
113+
{{ print "{{steps.generate-to-extract-batch.outputs.artifacts.batch_ids_csv}}" }}
114+
withSequence:
115+
count: >-
116+
{{ print "{{steps.generate-to-extract-batch.outputs.parameters.url_batches_count}}" }}
117+
118+
entrypoint: all
119+
serviceAccountName: {{ $.Values.common.workflowRbac.serviceAccountName }}
120+
schedules: [{{ .cronWorkflowTemplate.standard.schedule }}]
121+
{{- end }}
122+
{{- end }}
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
{{- with .Values.wikipediaUpdater }}
2+
apiVersion: argoproj.io/v1alpha1
3+
kind: WorkflowTemplate
4+
metadata:
5+
name: {{ .name }}-run-node
6+
spec:
7+
entrypoint: all
8+
serviceAccountName: {{ $.Values.common.workflowRbac.serviceAccountName }}
9+
securityContext:
10+
{{- include "common.security.podSecurityContext.restricted" dict | nindent 4 }}
11+
{{- if $.Values.common.runOnGpu }}
12+
affinity:
13+
nodeAffinity:
14+
preferredDuringSchedulingIgnoredDuringExecution:
15+
- weight: 1
16+
preference:
17+
matchExpressions:
18+
- key: gpu-node
19+
operator: In
20+
values:
21+
- "true"
22+
tolerations:
23+
- key: "Gpu"
24+
operator: "Equal"
25+
value: "true"
26+
effect: "NoSchedule"
27+
{{- end }}
28+
templates:
29+
30+
- name: wikipedia-updater
31+
podSpecPatch: >-
32+
{{ print "{\"containers\":[{\"name\":\"main\", \"resources\":{\"requests\": {\"memory\": \"{{inputs.parameters.memory}}\"}}}]}" }}
33+
inputs:
34+
parameters:
35+
- name: log_level
36+
default: "INFO"
37+
- name: urls_input_filename
38+
- name: urls_directory
39+
default: urls
40+
- name: memory
41+
default: {{ .workflowTemplates.runNode.requests.memory }}
42+
- name: size_limit
43+
default: 10000000000
44+
artifacts:
45+
- name: batch_ids_csv
46+
path: >-
47+
{{ printf "%s/%s/{{inputs.parameters.urls_directory}}" $.Values.common.artifactRoot $.Values.common.artifactInputFolder }}
48+
container:
49+
{{- with $.Values.image }}
50+
image: {{ tpl (printf "%s/%s:%s" .repository .path .tag) $ }}
51+
{{- end }}
52+
envFrom:
53+
- configMapRef:
54+
name: {{ .name }}
55+
env:
56+
- name: ARTIFACT_ROOT
57+
value: {{ $.Values.common.artifactRoot }}
58+
- name: IS_LOCAL
59+
value: "0"
60+
- name: LOG_LEVEL
61+
value: >-
62+
{{ print "{{inputs.parameters.log_level}}" }}
63+
- name: ARTIFACT_ID_URL_CSV_NAME
64+
value: >-
65+
{{ print "{{inputs.parameters.urls_directory}}/{{inputs.parameters.urls_input_filename}}" }}
66+
67+
args:
68+
- python
69+
- "-m"
70+
- welearn_datastack.nodes_workflow.WikipediaUpdater.wikipedia_updater
71+
volumeMounts:
72+
- name: secrets
73+
mountPath: "/secrets"
74+
readOnly: true
75+
volumes:
76+
- name: secrets
77+
secret:
78+
secretName: {{ .name }}
79+
{{- end }}
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
{{- with .Values.wikipediaUpdater }}
2+
apiVersion: argoproj.io/v1alpha1
3+
kind: WorkflowTemplate
4+
metadata:
5+
name: {{ .name }}-prepare-batches
6+
spec:
7+
entrypoint: all
8+
serviceAccountName: {{ $.Values.common.workflowRbac.serviceAccountName }}
9+
securityContext:
10+
{{- include "common.security.podSecurityContext.restricted" dict | nindent 4 }}
11+
{{- if $.Values.common.runOnGpu }}
12+
affinity:
13+
nodeAffinity:
14+
preferredDuringSchedulingIgnoredDuringExecution:
15+
- weight: 1
16+
preference:
17+
matchExpressions:
18+
- key: gpu-node
19+
operator: In
20+
values:
21+
- "true"
22+
tolerations:
23+
- key: "Gpu"
24+
operator: "Equal"
25+
value: "true"
26+
effect: "NoSchedule"
27+
{{- end }}
28+
templates:
29+
- name: generate-wikipedia-batch
30+
podSpecPatch: >-
31+
{{ print "{\"containers\":[{\"name\":\"main\", \"resources\":{\"limits\": {\"memory\": \"{{inputs.parameters.memory}}\"}, \"requests\": {\"memory\": \"{{inputs.parameters.memory}}\"}}}]}" }}
32+
inputs:
33+
parameters:
34+
- name: parallelism_treshold
35+
default: 100000
36+
- name: parallelism_max
37+
default: 15
38+
- name: batch_urls_directory
39+
default: batch_urls
40+
- name: retrieval_mode
41+
enum: ["NEW_MODE", "UPDATE_MODE"]
42+
default: "NEW_MODE"
43+
- name: pick_qty_max
44+
default: 100000
45+
- name: memory
46+
default: {{ .workflowTemplates.prepareBatches.requests.memory }}
47+
- name: batch_urls_count_file_name
48+
default: "quantity.txt"
49+
- name: output_filename
50+
default: "batch_ids.csv"
51+
- name: log_level
52+
default: "INFO"
53+
- name: size_limit
54+
default: 10000000000
55+
outputs:
56+
artifacts:
57+
- name: batch_ids_csv
58+
path: >-
59+
{{ printf "%s/%s/{{inputs.parameters.batch_urls_directory}}" $.Values.common.artifactRoot $.Values.common.artifactOutputFolder }}
60+
parameters:
61+
- name: url_batches_count
62+
valueFrom:
63+
path: >-
64+
{{ printf "%s/%s/{{inputs.parameters.batch_urls_directory}}/{{inputs.parameters.batch_urls_count_file_name}}" $.Values.common.artifactRoot $.Values.common.artifactOutputFolder }}
65+
container:
66+
{{- with $.Values.image }}
67+
image: {{ tpl (printf "%s/%s:%s" .repository .path .tag) $ }}
68+
{{- end }}
69+
envFrom:
70+
- configMapRef:
71+
name: {{ .name }}
72+
env:
73+
- name: ARTIFACT_ROOT
74+
value: {{ $.Values.common.artifactRoot }}
75+
- name: IS_LOCAL
76+
value: "0"
77+
- name: PARALLELISM_THRESHOLD
78+
value: >-
79+
{{ print "{{inputs.parameters.parallelism_treshold}}" }}
80+
- name: PARALLELISM_URL_MAX
81+
value: >-
82+
{{ print "{{inputs.parameters.parallelism_max}}" }}
83+
- name: BATCH_URLS_DIRECTORY
84+
value: >-
85+
{{ print "{{inputs.parameters.batch_urls_directory}}" }}
86+
- name: RETRIEVAL_MODE
87+
value: >-
88+
{{ print "{{inputs.parameters.retrieval_mode }}" }}
89+
- name: PICK_QTY_MAX
90+
value: >-
91+
{{ print "{{inputs.parameters.pick_qty_max }}" }}
92+
- name: LOG_LEVEL
93+
value: >-
94+
{{ print "{{inputs.parameters.log_level}}" }}
95+
- name: OUTPUT_FILE_NAME
96+
value: >-
97+
{{ print "{{inputs.parameters.output_filename}}" }}
98+
- name: SIZE_TOTAL_LIMIT
99+
value: >-
100+
{{ print "{{inputs.parameters.size_limit}}" }}
101+
args:
102+
- python
103+
- "-m"
104+
- welearn_datastack.nodes_workflow.WikipediaUpdater.generate_to_update_batch
105+
106+
volumeMounts:
107+
- name: secrets
108+
mountPath: "/secrets"
109+
readOnly: true
110+
111+
volumes:
112+
- name: secrets
113+
secret:
114+
secretName: {{ .name }}
115+
- name: azure-share
116+
csi:
117+
driver: file.csi.azure.com
118+
readOnly: true
119+
volumeAttributes:
120+
secretName: {{ $.Values.common.azureShare.secret.name }}
121+
shareName: {{ $.Values.common.azureShare.name }}
122+
{{- end }}

0 commit comments

Comments
 (0)