Merge pull request #9 from CyberCRI/Feature/usc-wf

lpi-tn · web-flow · commit fd6556bb4e93 · 2025-03-27T17:00:50.000+01:00
Feature/usc wf
diff --git a/k8s/welearn-datastack/templates/urlSanitaryCrawler/config.yaml b/k8s/welearn-datastack/templates/urlSanitaryCrawler/config.yaml
@@ -0,0 +1,25 @@
+{{- /*
+  Configuration resources for the URL sanitary crawler.
+  Builds the config and labels from the chart-wide common values plus the
+  component's own values, then delegates to "common.config.resources".
+  merge keeps keys already present in the destination, so common entries
+  take precedence over the component's entries.
+*/ -}}
+{{- with .Values.urlSanitaryCrawler }}
+{{- /* deepCopy the sources: merge shares nested maps with its sources, so merging .Values directly would let the second merge write into .Values.common.config. */ -}}
+{{- $config := dict}}
+{{- $_ := merge $config (deepCopy (default (dict) $.Values.common.config))}}
+{{- $_ := merge $config (deepCopy (default (dict) .config))}}
+
+{{- $commonLabels := dict }}
+{{- $_ := merge $commonLabels $.Values.common.commonLabels}}
+{{- $_ := merge $commonLabels .commonLabels}}
+
+{{- $configGenerator := dict
+"config" $config
+"root" $
+"commonLabels" $commonLabels
+"name" .name }}
+
+{{- include "common.config.resources" $configGenerator}}
+{{- end }}
diff --git a/k8s/welearn-datastack/templates/urlSanitaryCrawler/cron-workflow.yaml b/k8s/welearn-datastack/templates/urlSanitaryCrawler/cron-workflow.yaml
@@ -0,0 +1,133 @@
+{{- /*
+  CronWorkflow for the URL sanitary crawler.
+  Rendered only when both the component's cronWorkflowTemplate.standard.enabled
+  and the chart-wide cronWorkflows.enabled are true. The "all" entrypoint runs
+  two steps: batch generation (<name>-prepare-batches), then the crawler
+  (<name>-run-node) expanded with withSequence over the reported batch count.
+*/ -}}
+{{- with .Values.urlSanitaryCrawler }}
+{{- if and (.cronWorkflowTemplate.standard.enabled) ($.Values.cronWorkflows.enabled) }}
+kind: CronWorkflow
+apiVersion: argoproj.io/v1alpha1
+metadata:
+  name: {{ .name }}
+spec:
+  workflowSpec:
+    {{- if $.Values.common.runOnGpu }}
+    affinity:
+      nodeAffinity:
+        preferredDuringSchedulingIgnoredDuringExecution:
+        - weight: 1
+          preference:
+            matchExpressions:
+            - key: gpu-node
+              operator: In
+              values:
+              - "true"
+    tolerations:
+    - key: "Gpu"
+      operator: "Equal"
+      value: "true"
+      effect: "NoSchedule"
+    {{- end }}
+    securityContext:
+      {{- include "common.security.podSecurityContext.restricted" dict | nindent 6 }}
+    templates:
+      - name: all
+        inputs:
+          parameters:
+          - name: csv_path
+            default: /etc/welearndatastack/collectors
+          - name: parallelism_treshold
+            default: '500'
+          - name: parallelism_max
+            default: '15'
+          - name: retrieval_mode
+            enum: ["NEW_MODE", "UPDATE_MODE"]
+            default: "NEW_MODE"
+          - name: pick_qty_max
+            default: '1000'
+          - name: log_level
+            default: "INFO"
+          - name: batch_urls_directory
+            default: batch_urls
+          - name: memory_batch_urls
+            default: {{ .workflowTemplates.prepareBatches.requests.memory }}
+          - name: batch_urls_ids_file_name
+            default: "batch_ids.csv"
+          - name: memory_collect_docs
+            default: {{ .workflowTemplates.runNode.requests.memory }}
+          - name: size_limit
+            default: 10000000000 # In bytes
+        # Argo expressions below are wrapped in Helm's `print` so that Helm emits
+        # the double-brace text verbatim instead of evaluating it.
+        steps:
+          - - name: generate-to-sanitize-batch
+              templateRef:
+                name: {{ .name }}-prepare-batches
+                template: generate-to-sanitize-batch
+              arguments:
+                parameters:
+                  - name: log_level
+                    value: >-
+                      {{ print "{{inputs.parameters.log_level}}" }}
+                  - name: parallelism_treshold
+                    value: >-
+                      {{ print "{{inputs.parameters.parallelism_treshold}}" }}
+                  - name: parallelism_max
+                    value: >-
+                      {{ print "{{inputs.parameters.parallelism_max}}" }}
+                  - name: batch_urls_directory
+                    value: >-
+                      {{ print "{{inputs.parameters.batch_urls_directory}}" }}
+                  - name: retrieval_mode
+                    value: >-
+                      {{ print "{{inputs.parameters.retrieval_mode}}" }}
+                  - name: memory
+                    value: >-
+                      {{ print "{{inputs.parameters.memory_batch_urls}}" }}
+                  - name: batch_urls_count_file_name
+                    value: "quantity.txt"
+                  - name: pick_qty_max
+                    value: >-
+                      {{ print "{{inputs.parameters.pick_qty_max}}" }}
+                  - name: output_filename
+                    value: >-
+                      {{ print "{{inputs.parameters.batch_urls_ids_file_name}}" }}
+                  - name: size_limit
+                    value: >-
+                      {{ print "{{inputs.parameters.size_limit}}" }}
+          - - name: url-sanitary-crawler
+              templateRef:
+                name: {{ .name }}-run-node
+                template: url-sanitary-crawler
+              arguments:
+                parameters:
+                  - name: log_level
+                    value: >-
+                      {{ print "{{inputs.parameters.log_level}}" }}
+                  - name: urls_input_filename
+                    value: >-
+                      {{ print "{{item}}_{{inputs.parameters.batch_urls_ids_file_name}}" }}
+                  - name: urls_directory
+                    value: >-
+                      {{ print "{{inputs.parameters.batch_urls_directory}}" }}
+                    default: urls  # NOTE(review): presumably ignored because `value` is always set — confirm and drop
+                  - name: memory
+                    value: >-
+                      {{ print "{{inputs.parameters.memory_collect_docs}}" }}
+                artifacts:
+                  - name: batch_ids_csv
+                    from: >-
+                      {{ print "{{steps.generate-to-sanitize-batch.outputs.artifacts.batch_ids_csv}}" }}
+              withSequence:
+                count: >-
+                  {{ print "{{steps.generate-to-sanitize-batch.outputs.parameters.url_batches_count}}" }}
+
+    entrypoint: all
+    serviceAccountName: {{ $.Values.common.workflowRbac.serviceAccountName }}
+  # Quoted so that cron expressions containing "," or starting with "*" stay a
+  # single string inside the YAML flow sequence.
+  schedules: [{{ .cronWorkflowTemplate.standard.schedule | quote }}]
+{{- end }}
+{{- end }}
diff --git a/k8s/welearn-datastack/templates/urlSanitaryCrawler/workflow-template-generate-to-update-wikipedia.yaml b/k8s/welearn-datastack/templates/urlSanitaryCrawler/workflow-template-generate-to-update-wikipedia.yaml
@@ -0,0 +1,132 @@
+{{- /*
+  WorkflowTemplate "<name>-prepare-batches" for the URL sanitary crawler.
+  Its single template, generate-to-sanitize-batch, runs the
+  generate_to_sanitize_batch Python module and exposes the batch directory as
+  the batch_ids_csv artifact and the batch count file as url_batches_count.
+*/ -}}
+{{- with .Values.urlSanitaryCrawler }}
+apiVersion: argoproj.io/v1alpha1
+kind: WorkflowTemplate
+metadata:
+  name: {{ .name }}-prepare-batches
+spec:
+  # Must name a template defined below; this file defines no "all" template.
+  entrypoint: generate-to-sanitize-batch
+  serviceAccountName: {{ $.Values.common.workflowRbac.serviceAccountName }}
+  securityContext:
+    {{- include "common.security.podSecurityContext.restricted" dict | nindent 4 }}
+  {{- if $.Values.common.runOnGpu }}
+  affinity:
+    nodeAffinity:
+      preferredDuringSchedulingIgnoredDuringExecution:
+      - weight: 1
+        preference:
+          matchExpressions:
+          - key: gpu-node
+            operator: In
+            values:
+            - "true"
+  tolerations:
+  - key: "Gpu"
+    operator: "Equal"
+    value: "true"
+    effect: "NoSchedule"
+  {{- end }}
+  templates:
+    - name: generate-to-sanitize-batch
+      # Applies the `memory` input as both request and limit of the main container.
+      podSpecPatch: >-
+        {{ print "{\"containers\":[{\"name\":\"main\", \"resources\":{\"limits\": {\"memory\": \"{{inputs.parameters.memory}}\"}, \"requests\": {\"memory\": \"{{inputs.parameters.memory}}\"}}}]}" }}
+      inputs:
+        parameters:
+          - name: parallelism_treshold
+            default: 100000
+          - name: parallelism_max
+            default: 15
+          - name: batch_urls_directory
+            default: batch_urls
+          - name: retrieval_mode
+            enum: ["NEW_MODE", "UPDATE_MODE"]
+            default: "NEW_MODE"
+          - name: pick_qty_max
+            default: 100000
+          - name: memory
+            default: {{ .workflowTemplates.prepareBatches.requests.memory }}
+          - name: batch_urls_count_file_name
+            default: "quantity.txt"
+          - name: output_filename
+            default: "batch_ids.csv"
+          - name: log_level
+            default: "INFO"
+          - name: size_limit
+            default: 10000000000
+      outputs:
+        artifacts:
+          - name: batch_ids_csv
+            path: >-
+              {{ printf "%s/%s/{{inputs.parameters.batch_urls_directory}}" $.Values.common.artifactRoot $.Values.common.artifactOutputFolder }}
+        parameters:
+          - name: url_batches_count
+            valueFrom:
+              path: >-
+                {{ printf "%s/%s/{{inputs.parameters.batch_urls_directory}}/{{inputs.parameters.batch_urls_count_file_name}}" $.Values.common.artifactRoot $.Values.common.artifactOutputFolder }}
+      container:
+      {{- with $.Values.image }}
+        image: {{ tpl (printf "%s/%s:%s" .repository .path .tag) $ }}
+      {{- end }}
+        envFrom:
+          - configMapRef:
+              name: {{ .name }}
+        env:
+          - name: ARTIFACT_ROOT
+            value: {{ $.Values.common.artifactRoot }}
+          - name: IS_LOCAL
+            value: "0"
+          - name: PARALLELISM_THRESHOLD
+            value: >-
+              {{ print "{{inputs.parameters.parallelism_treshold}}" }}
+          - name: PARALLELISM_URL_MAX
+            value: >-
+               {{ print "{{inputs.parameters.parallelism_max}}" }}
+          - name: BATCH_URLS_DIRECTORY
+            value: >-
+               {{ print "{{inputs.parameters.batch_urls_directory}}" }}
+          - name: RETRIEVAL_MODE
+            value: >-
+               {{ print "{{inputs.parameters.retrieval_mode }}" }}
+          - name: PICK_QTY_MAX
+            value: >-
+               {{ print "{{inputs.parameters.pick_qty_max }}" }}
+          - name: LOG_LEVEL
+            value: >-
+              {{ print "{{inputs.parameters.log_level}}" }}
+          - name: OUTPUT_FILE_NAME
+            value: >-
+              {{ print "{{inputs.parameters.output_filename}}" }}
+          - name: SIZE_TOTAL_LIMIT
+            value: >-
+              {{ print "{{inputs.parameters.size_limit}}" }}
+        args:
+          - python
+          - "-m"
+          - welearn_datastack.nodes_workflow.URLSanitaryCrawler.generate_to_sanitize_batch
+
+        volumeMounts:
+        - name: secrets
+          mountPath: "/secrets"
+          readOnly: true
+
+      volumes:
+      - name: secrets
+        secret:
+          secretName: {{ .name }}
+      # NOTE(review): azure-share is declared but no volumeMount in this template
+      # uses it — confirm whether it is still needed.
+      - name: azure-share
+        csi:
+          driver: file.csi.azure.com
+          readOnly: true
+          volumeAttributes:
+            secretName: {{ $.Values.common.azureShare.secret.name }}
+            shareName: {{ $.Values.common.azureShare.name }}
+{{- end }}
diff --git a/k8s/welearn-datastack/templates/urlSanitaryCrawler/workflow-template-wikipedia-updater.yaml b/k8s/welearn-datastack/templates/urlSanitaryCrawler/workflow-template-wikipedia-updater.yaml
@@ -0,0 +1,88 @@
+{{- /*
+  WorkflowTemplate "<name>-run-node" for the URL sanitary crawler.
+  Its single template, url-sanitary-crawler, receives the batch_ids_csv
+  artifact under the artifact input folder and runs the url_sanitary_crawler
+  Python module against the file named by urls_directory/urls_input_filename.
+*/ -}}
+{{- with .Values.urlSanitaryCrawler }}
+apiVersion: argoproj.io/v1alpha1
+kind: WorkflowTemplate
+metadata:
+  name: {{ .name }}-run-node
+spec:
+  # Must name a template defined below; this file defines no "all" template.
+  entrypoint: url-sanitary-crawler
+  serviceAccountName: {{ $.Values.common.workflowRbac.serviceAccountName }}
+  securityContext:
+    {{- include "common.security.podSecurityContext.restricted" dict | nindent 4 }}
+  {{- if $.Values.common.runOnGpu }}
+  affinity:
+    nodeAffinity:
+      preferredDuringSchedulingIgnoredDuringExecution:
+      - weight: 1
+        preference:
+          matchExpressions:
+          - key: gpu-node
+            operator: In
+            values:
+            - "true"
+  tolerations:
+  - key: "Gpu"
+    operator: "Equal"
+    value: "true"
+    effect: "NoSchedule"
+  {{- end }}
+  templates:
+
+    - name: url-sanitary-crawler
+      # Applies the `memory` input as the memory request of the main container.
+      podSpecPatch: >-
+        {{ print "{\"containers\":[{\"name\":\"main\", \"resources\":{\"requests\": {\"memory\": \"{{inputs.parameters.memory}}\"}}}]}" }}
+      inputs:
+        parameters:
+          - name: log_level
+            default: "INFO"
+          - name: urls_input_filename
+          - name: urls_directory
+            default: urls
+          - name: memory
+            default: {{ .workflowTemplates.runNode.requests.memory }}
+          # NOTE(review): size_limit is not referenced anywhere in this template — confirm it is needed.
+          - name: size_limit
+            default: 10000000000
+        artifacts:
+          - name: batch_ids_csv
+            path: >-
+              {{ printf "%s/%s/{{inputs.parameters.urls_directory}}" $.Values.common.artifactRoot $.Values.common.artifactInputFolder }}
+      container:
+      {{- with $.Values.image }}
+        image: {{ tpl (printf "%s/%s:%s" .repository .path .tag) $ }}
+      {{- end }}
+        envFrom:
+          - configMapRef:
+              name: {{ .name }}
+        env:
+          - name: ARTIFACT_ROOT
+            value: {{ $.Values.common.artifactRoot }}
+          - name: IS_LOCAL
+            value: "0"
+          - name: LOG_LEVEL
+            value: >-
+              {{ print "{{inputs.parameters.log_level}}" }}
+          - name: ARTIFACT_ID_URL_CSV_NAME
+            value: >-
+              {{ print "{{inputs.parameters.urls_directory}}/{{inputs.parameters.urls_input_filename}}" }}
+
+        args:
+          - python
+          - "-m"
+          - welearn_datastack.nodes_workflow.URLSanitaryCrawler.url_sanitary_crawler
+        volumeMounts:
+        - name: secrets
+          mountPath: "/secrets"
+          readOnly: true
+      volumes:
+      - name: secrets
+        secret:
+          secretName: {{ .name }}
+{{- end }}
diff --git a/k8s/welearn-datastack/values.yaml b/k8s/welearn-datastack/values.yaml
@@ -227,6 +227,32 @@ wikipediaUpdater:
       schedule: 40 */1 * * *
 
 
+urlSanitaryCrawler:  # Values read by the templates under templates/urlSanitaryCrawler/
+  name: url-sanitary-crawler  # CronWorkflow name, WorkflowTemplate name prefix, and ConfigMap/Secret name
+  config:  # Merged with common.config in urlSanitaryCrawler/config.yaml
+    sensitive: {}
+    nonSensitive: {}
+
+  resourcesPrefix: url-sanitary-crawler  # NOTE(review): not referenced by the urlSanitaryCrawler templates in this change — confirm usage
+
+  commonLabels:  # Merged with common.commonLabels in urlSanitaryCrawler/config.yaml
+    app.kubernetes.io/component: url-sanitary-crawler
+
+  workflowTemplates:
+    prepareBatches:
+      requests:
+        memory: 75M  # Default memory for generate-to-sanitize-batch (applied as request and limit)
+    runNode:
+      requests:
+        memory: 20M  # Default memory for url-sanitary-crawler (applied as request only)
+
+  cronWorkflowTemplate:
+    standard:
+      enabled: true  # The CronWorkflow is rendered only if cronWorkflows.enabled is also true
+      schedule: 40 */1 * * *  # Cron expression rendered into the CronWorkflow's spec.schedules
+
+
+
 qdrantSyncronizer:
   name: qdrant-synchronizer
   config:
diff --git a/welearn_datastack/nodes_workflow/URLSanitaryCrawler/url_sanitary_crawler.py b/welearn_datastack/nodes_workflow/URLSanitaryCrawler/url_sanitary_crawler.py
@@ -59,9 +59,6 @@ def main() -> None:
     )
     logger.info("'%s' WeLearnDocuments were retrieved", len(welearn_documents))
 
-    wlds_ids_to_update: List[Tuple[UUID, int]] = []
-    wlds_ids_to_delete: List[Tuple[UUID, int]] = []
-
     # Check url
     logger.info("Check URL state")
 

Original file line number	Diff line number	Diff line change
`@@ -59,9 +59,6 @@ def main() -> None:`
`59`	`59`	`)`
`60`	`60`	`logger.info("'%s' WeLearnDocuments were retrieved", len(welearn_documents))`
`61`	`61`
`62`		`- wlds_ids_to_update: List[Tuple[UUID, int]] = []`
`63`		`- wlds_ids_to_delete: List[Tuple[UUID, int]] = []`
`64`		`-`
`65`	`62`	`# Check url`
`66`	`63`	`logger.info("Check URL state")`
`67`	`64`