Skip to content

Commit 8a67644

Browse files
authored
feat: default execution mode (#1642)
1 parent d3150e7 commit 8a67644

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed

+1293
-578
lines changed

charts/dagu/README.md

Lines changed: 60 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ The chart deploys four components:
3434

3535
- **Coordinator**: gRPC server for distributed task execution (port 50055)
3636
- **Scheduler**: Manages DAG execution schedules (port 8090 for health)
37-
- **Worker**: Executes DAG steps (2 replicas by default)
37+
- **Worker**: Executes DAG steps (configurable pools with independent replicas)
3838
- **UI**: Web interface for managing DAGs (port 8080)
3939

4040
All components share a single PersistentVolumeClaim with `ReadWriteMany` access mode.
@@ -56,7 +56,57 @@ For local single-node clusters that don't support RWX:
5656
helm install dagu charts/dagu \
5757
--set persistence.accessMode=ReadWriteOnce \
5858
--set persistence.skipValidation=true \
59-
--set worker.replicas=1
59+
--set workerPools.general.replicas=1
60+
```
61+
62+
### Worker Pools
63+
64+
Workers are organized into pools. Each pool creates a separate Kubernetes Deployment with its own replicas, labels, resources, and scheduling constraints. DAGs select workers via `workerSelector` labels that match a pool's labels.
65+
66+
```yaml
67+
workerPools:
68+
general:
69+
replicas: 2
70+
labels: {}
71+
resources:
72+
requests:
73+
memory: "128Mi"
74+
cpu: "100m"
75+
limits:
76+
memory: "256Mi"
77+
cpu: "200m"
78+
nodeSelector: {}
79+
tolerations: []
80+
affinity: {}
81+
82+
gpu:
83+
replicas: 1
84+
labels:
85+
gpu: "true"
86+
resources:
87+
requests:
88+
memory: "512Mi"
89+
cpu: "500m"
90+
nvidia.com/gpu: "1"
91+
limits:
92+
memory: "1Gi"
93+
cpu: "1000m"
94+
nvidia.com/gpu: "1"
95+
nodeSelector:
96+
nvidia.com/gpu.present: "true"
97+
tolerations:
98+
- key: nvidia.com/gpu
99+
operator: Exists
100+
effect: NoSchedule
101+
affinity: {}
102+
```
103+
104+
A pool with `labels: {}` (like `general` above) matches any DAG that has no `workerSelector`. To route a DAG to a specific pool, set `workerSelector` in the DAG definition to match the pool's labels:
105+
106+
```yaml
107+
# In your DAG file
108+
workerSelector:
109+
gpu: "true"
60110
```
61111

62112
### Authentication
@@ -101,12 +151,14 @@ scheduler:
101151
memory: "256Mi"
102152
cpu: "250m"
103153
104-
worker:
105-
replicas: 2
106-
resources:
107-
requests:
108-
memory: "128Mi"
109-
cpu: "100m"
154+
workerPools:
155+
general:
156+
replicas: 2
157+
labels: {}
158+
resources:
159+
requests:
160+
memory: "128Mi"
161+
cpu: "100m"
110162
111163
ui:
112164
replicas: 1

charts/dagu/templates/NOTES.txt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,11 @@ Make sure your storage class supports RWX access mode (e.g., NFS, EFS, CephFS).
55

66
Components deployed:
77
Scheduler: {{ .Values.scheduler.replicas }} replica(s)
8-
Worker: {{ .Values.worker.replicas }} replica(s)
8+
Worker Pools:
9+
{{- range $poolName, $pool := .Values.workerPools }}
10+
- {{ $poolName }}: {{ $pool.replicas }} replica(s)
11+
{{- if $pool.labels }} (labels: {{ include "dagu.workerLabels" $pool.labels }}){{- end }}
12+
{{- end }}
913
UI: {{ .Values.ui.replicas }} replica(s)
1014

1115
Access the UI:

charts/dagu/templates/_helpers.tpl

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,15 @@ app.kubernetes.io/managed-by: {{ .Release.Service }}
2222
helm.sh/chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
2323
{{- end }}
2424

25+
{{- define "dagu.selectorLabels" -}}
26+
app.kubernetes.io/name: {{ include "dagu.name" . }}
27+
app.kubernetes.io/instance: {{ .Release.Name }}
28+
{{- end }}
29+
30+
{{- define "dagu.workerLabels" -}}
31+
{{- $pairs := list -}}
32+
{{- range $key, $value := . -}}
33+
{{- $pairs = append $pairs (printf "%s=%v" $key $value) -}}
34+
{{- end -}}
35+
{{- join "," $pairs -}}
36+
{{- end }}

charts/dagu/templates/configmap.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ data:
1515
host: "0.0.0.0"
1616
port: 8080
1717
apiBasePath: "/api/v1"
18+
defaultExecutionMode: "distributed"
1819
1920
# Coordinator (distributed execution)
2021
coordinator:
Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,46 @@
1+
{{- range $poolName, $pool := .Values.workerPools }}
2+
{{- if not (regexMatch "^[a-z][a-z0-9-]*$" $poolName) }}
3+
{{- fail (printf "invalid workerPool name %q: must match ^[a-z][a-z0-9-]*$" $poolName) }}
4+
{{- end }}
5+
---
16
apiVersion: apps/v1
27
kind: Deployment
38
metadata:
4-
name: {{ include "dagu.fullname" . }}-worker
9+
name: {{ include "dagu.fullname" $ }}-worker-{{ $poolName }}
510
labels:
6-
{{- include "dagu.labels" . | nindent 4 }}
11+
{{- include "dagu.labels" $ | nindent 4 }}
712
app.kubernetes.io/component: worker
13+
dagu.io/worker-pool: {{ $poolName }}
814
spec:
9-
replicas: {{ .Values.worker.replicas }}
15+
replicas: {{ $pool.replicas }}
1016
selector:
1117
matchLabels:
12-
{{- include "dagu.labels" . | nindent 6 }}
18+
{{- include "dagu.selectorLabels" $ | nindent 6 }}
1319
app.kubernetes.io/component: worker
20+
dagu.io/worker-pool: {{ $poolName }}
1421
template:
1522
metadata:
1623
labels:
17-
{{- include "dagu.labels" . | nindent 8 }}
24+
{{- include "dagu.labels" $ | nindent 8 }}
1825
app.kubernetes.io/component: worker
26+
dagu.io/worker-pool: {{ $poolName }}
1927
spec:
2028
# Disable Kubernetes Service env var injection to avoid overriding
2129
# dagu config values (e.g., scheduler.port) with Service URLs
2230
enableServiceLinks: false
2331
containers:
2432
- name: worker
25-
image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
26-
imagePullPolicy: {{ .Values.image.pullPolicy }}
33+
image: "{{ $.Values.image.repository }}:{{ $.Values.image.tag }}"
34+
imagePullPolicy: {{ $.Values.image.pullPolicy }}
2735
command:
2836
- dagu
2937
- worker
3038
- --config
3139
- /etc/dagu/dagu.yaml
40+
{{- if $pool.labels }}
41+
- --worker.labels
42+
- {{ include "dagu.workerLabels" $pool.labels | quote }}
43+
{{- end }}
3244
env:
3345
- name: WORKER_ID
3446
valueFrom:
@@ -40,11 +52,24 @@ spec:
4052
- name: config
4153
mountPath: /etc/dagu
4254
resources:
43-
{{- toYaml .Values.worker.resources | nindent 12 }}
55+
{{- toYaml $pool.resources | nindent 12 }}
56+
{{- with $pool.nodeSelector }}
57+
nodeSelector:
58+
{{- toYaml . | nindent 8 }}
59+
{{- end }}
60+
{{- with $pool.tolerations }}
61+
tolerations:
62+
{{- toYaml . | nindent 8 }}
63+
{{- end }}
64+
{{- with $pool.affinity }}
65+
affinity:
66+
{{- toYaml . | nindent 8 }}
67+
{{- end }}
4468
volumes:
4569
- name: data
4670
persistentVolumeClaim:
47-
claimName: {{ include "dagu.fullname" . }}-data
71+
claimName: {{ include "dagu.fullname" $ }}-data
4872
- name: config
4973
configMap:
50-
name: {{ include "dagu.fullname" . }}-config
74+
name: {{ include "dagu.fullname" $ }}-config
75+
{{- end }}

charts/dagu/values.yaml

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,23 @@ coordinator:
2929
memory: "256Mi"
3030
cpu: "200m"
3131

32-
# Worker configuration
33-
worker:
34-
replicas: 2
35-
resources:
36-
requests:
37-
memory: "128Mi"
38-
cpu: "100m"
39-
limits:
40-
memory: "256Mi"
41-
cpu: "200m"
32+
# Worker pool configuration
33+
# Each pool creates a separate Kubernetes Deployment.
34+
# Pool names become part of the Deployment name: <release>-dagu-worker-<poolName>
35+
workerPools:
36+
general:
37+
replicas: 2
38+
labels: {}
39+
resources:
40+
requests:
41+
memory: "128Mi"
42+
cpu: "100m"
43+
limits:
44+
memory: "256Mi"
45+
cpu: "200m"
46+
nodeSelector: {}
47+
tolerations: []
48+
affinity: {}
4249

4350
# UI configuration
4451
ui:

internal/cmd/context.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,7 @@ func (c *Context) NewScheduler() (*scheduler.Scheduler, error) {
335335
}
336336

337337
coordinatorCli := c.NewCoordinatorClient()
338-
de := scheduler.NewDAGExecutor(coordinatorCli, runtime.NewSubCmdBuilder(c.Config))
338+
de := scheduler.NewDAGExecutor(coordinatorCli, runtime.NewSubCmdBuilder(c.Config), c.Config.DefaultExecMode)
339339
m := scheduler.NewEntryReader(c.Config.Paths.DAGsDir, dr, c.DAGRunMgr, de, c.Config.Paths.Executable)
340340
return scheduler.New(c.Config, m, c.DAGRunMgr, c.DAGRunStore, c.QueueStore, c.ProcStore, c.ServiceRegistry, coordinatorCli)
341341
}

internal/cmd/dry.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ func runDry(ctx *Context, args []string) error {
8181
ServiceRegistry: ctx.ServiceRegistry,
8282
RootDAGRun: exec.NewDAGRunRef(dag.Name, dagRunID),
8383
PeerConfig: ctx.Config.Core.Peer,
84+
DefaultExecMode: ctx.Config.DefaultExecMode,
8485
},
8586
)
8687

internal/cmd/restart.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ func executeDAGWithRunID(ctx *Context, cli runtime.Manager, dag *core.DAG, dagRu
161161
ServiceRegistry: ctx.ServiceRegistry,
162162
RootDAGRun: exec.NewDAGRunRef(dag.Name, dagRunID),
163163
PeerConfig: ctx.Config.Core.Peer,
164+
DefaultExecMode: ctx.Config.DefaultExecMode,
164165
})
165166

166167
listenSignals(ctx, agentInstance)

internal/cmd/retry.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ func executeRetry(ctx *Context, dag *core.DAG, status *exec.DAGRunStatus, rootRu
142142
RootDAGRun: rootRun,
143143
PeerConfig: ctx.Config.Core.Peer,
144144
TriggerType: core.TriggerTypeRetry,
145+
DefaultExecMode: ctx.Config.DefaultExecMode,
145146
},
146147
)
147148

0 commit comments

Comments
 (0)