
Commit 73130b5

anuj-atlan and claude committed
feat: add used worker slots to Temporal scaler composite metric
Composite metric is now: backlog + runningWorkflowCount + usedWorkerSlots

This prevents premature scale-down when workers are actively executing tasks
but the task queue backlog is empty. A single workflow can spawn many
concurrent activities, so runningWorkflowCount alone undercounts load.

How it works:
- Lists worker pods in the ScaledObject's namespace via the Kubernetes API
  (app.kubernetes.io/component=worker label selector)
- Scrapes each pod's /metrics endpoint (port 9464, configurable via
  workerMetricsPort) and parses temporal_worker_task_slots_used
- Counts only ActivityWorker slots for the configured task queue;
  WorkflowWorker slots always report >= 1 due to the SDK's sticky workflow
  cache, so they are excluded (queueTypes still controls backlog counting)
- Sums used slots across all pods and adds the total to the metric

Failure handling:
- Single pod scrape fails: skip the pod, use the partial sum from the rest
- All pod scrapes fail, cache < 180s old: return last known good value
- All pod scrapes fail, cache expired: return 0
- Scrape loop exceeds its 12s budget: stop, use partial results so far
- SDK < 2.5.0 (no metrics endpoint): scrapes fail gracefully;
  runningWorkflowCount still protects against premature scale-down

Observability:
- keda_temporal_scaler_worker_slots_scrape_errors_total{namespace, task_queue, reason}
  counter with four reason values: pod_scrape_error, scrape_loop_timeout,
  all_pods_failed_cache_hit, all_pods_failed_cache_expired

Changes:
- pkg/scalers/temporal_scaler.go: pod discovery, metrics scraping with timeout
  budget and cache, ActivityWorker slot filtering by task queue, scrape error
  counter registered on the controller-runtime metrics registry
- pkg/scaling/scalers_builder.go: pass Kubernetes client to the constructor
- pkg/scalers/temporal_scaler_test.go: slot parsing, worker type mapping,
  cache fallback, scrape timeout (end-to-end with fake kube client)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 0c957fa commit 73130b5

4 files changed: 517 additions & 16 deletions
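The scrape-and-filter step described in the commit message can be illustrated in isolation. The sketch below is editorial, not part of the commit: the metric name, label names, and the ActivityWorker-only filter match the diff that follows, while the task queue names ("orders", "billing") and slot values are invented, and the filter is inlined rather than calling the commit's unexported parseUsedSlots helper.

package main

import (
    "fmt"
    "strings"

    "github.com/prometheus/common/expfmt"
)

func main() {
    // Invented excerpt of what a Temporal worker pod might serve on :9464/metrics.
    payload := `# TYPE temporal_worker_task_slots_used gauge
temporal_worker_task_slots_used{task_queue="orders",worker_type="ActivityWorker"} 7
temporal_worker_task_slots_used{task_queue="orders",worker_type="WorkflowWorker"} 1
temporal_worker_task_slots_used{task_queue="billing",worker_type="ActivityWorker"} 3
`
    var parser expfmt.TextParser
    families, err := parser.TextToMetricFamilies(strings.NewReader(payload))
    if err != nil {
        panic(err)
    }

    // Sum ActivityWorker slots for the "orders" queue, as the scaler does.
    var total int64
    for _, m := range families["temporal_worker_task_slots_used"].GetMetric() {
        var isActivity, queueMatches bool
        for _, lp := range m.GetLabel() {
            switch lp.GetName() {
            case "worker_type":
                isActivity = lp.GetValue() == "ActivityWorker"
            case "task_queue":
                queueMatches = lp.GetValue() == "orders"
            }
        }
        if isActivity && queueMatches {
            total += int64(m.GetGauge().GetValue())
        }
    }
    fmt.Println(total) // 7: WorkflowWorker and other queues are excluded
}

Running this prints 7: the WorkflowWorker gauge and the other queue's slots are ignored, which is why a pod executing many concurrent activities keeps the composite metric high even when the backlog reads 0.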

.github/workflows/custom-build-image.yml

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@ on:
   push:
     branches:
       - 2.18.3-main
+      - feature/temporal-scaler-worker-slots-metric

 jobs:
   build:

pkg/scalers/temporal_scaler.go

Lines changed: 250 additions & 15 deletions
@@ -4,36 +4,89 @@ import (
     "context"
     "crypto/tls"
     "fmt"
+    "io"
     "log/slog"
+    "net"
+    "net/http"
+    "strconv"
     "strings"
+    "sync"
     "time"

     "github.com/go-logr/logr"
+    "github.com/prometheus/client_golang/prometheus"
+    dto "github.com/prometheus/client_model/go"
+    "github.com/prometheus/common/expfmt"
     workflowservice "go.temporal.io/api/workflowservice/v1"
     sdk "go.temporal.io/sdk/client"
     sdklog "go.temporal.io/sdk/log"
     "google.golang.org/grpc"
     "google.golang.org/grpc/metadata"
     v2 "k8s.io/api/autoscaling/v2"
+    corev1 "k8s.io/api/core/v1"
     "k8s.io/metrics/pkg/apis/external_metrics"
+    "sigs.k8s.io/controller-runtime/pkg/client"
+    ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"

     "github.com/kedacore/keda/v2/pkg/scalers/scalersconfig"
     kedautil "github.com/kedacore/keda/v2/pkg/util"
 )

+const (
+    // scrapeLoopTimeout is the total time budget for scraping all worker pods.
+    scrapeLoopTimeout = 12 * time.Second
+    // slotsCacheTTL is how long a cached slots value remains valid after a
+    // successful scrape before falling back to 0 on persistent failure.
+    slotsCacheTTL = 180 * time.Second
+    // maxMetricsResponseBytes limits the size of a single pod's /metrics response
+    // to prevent OOM from misconfigured or malicious pods.
+    maxMetricsResponseBytes = 10 * 1024 * 1024 // 10 MB
+)
+
 var (
     temporalDefauleQueueTypes = []sdk.TaskQueueType{
         sdk.TaskQueueTypeActivity,
         sdk.TaskQueueTypeWorkflow,
         sdk.TaskQueueTypeNexus,
     }
+
+    // temporalSlotsScrapeErrors counts worker slot scrape failures by reason:
+    //   pod_scrape_error              – a single pod's /metrics request failed
+    //   scrape_loop_timeout           – 12s budget exceeded, used partial results
+    //   all_pods_failed_cache_hit     – all pods failed; returned last cached value
+    //   all_pods_failed_cache_expired – all pods failed and cache expired; returned 0
+    temporalSlotsScrapeErrors = prometheus.NewCounterVec(
+        prometheus.CounterOpts{
+            Namespace: "keda",
+            Subsystem: "temporal_scaler",
+            Name:      "worker_slots_scrape_errors_total",
+            Help: "Total number of temporal worker slot scrape failures. " +
+                "Use reason label to distinguish pod-level errors from full-scrape failures.",
+        },
+        []string{"namespace", "task_queue", "reason"},
+    )
 )

+func init() {
+    ctrlmetrics.Registry.MustRegister(temporalSlotsScrapeErrors)
+}
+
+// slotsCache holds the last successful slots scrape result.
+type slotsCache struct {
+    value     int64
+    timestamp time.Time
+}
+
 type temporalScaler struct {
-    metricType v2.MetricTargetType
-    metadata   *temporalMetadata
-    tcl        sdk.Client
-    logger     logr.Logger
+    metricType   v2.MetricTargetType
+    metadata     *temporalMetadata
+    tcl          sdk.Client
+    kubeClient   client.Client
+    httpClient   *http.Client
+    logger       logr.Logger
+    podNamespace string
+    slotsMu      sync.Mutex
+    lastSlots    slotsCache
 }

 type temporalMetadata struct {
@@ -48,6 +101,7 @@ type temporalMetadata struct {
     Unversioned                 bool   `keda:"name=selectUnversioned, order=triggerMetadata, default=false"`
     IncludeRunningWorkflowCount bool   `keda:"name=includeRunningWorkflowCount, order=triggerMetadata, default=true"`
     WorkflowTaskQueueForCount   string `keda:"name=workflowTaskQueueForCount, order=triggerMetadata;resolvedEnv, optional"`
+    WorkerMetricsPort           int    `keda:"name=workerMetricsPort, order=triggerMetadata, default=9464"`
     APIKey                      string `keda:"name=apiKey, order=authParams;resolvedEnv, optional"`
     MinConnectTimeout           int    `keda:"name=minConnectTimeout, order=triggerMetadata, default=5"`
@@ -77,10 +131,14 @@ func (a *temporalMetadata) Validate() error {
         return fmt.Errorf("minConnectTimeout must be a positive number")
     }

+    if a.WorkerMetricsPort < 1 || a.WorkerMetricsPort > 65535 {
+        return fmt.Errorf("workerMetricsPort must be between 1 and 65535")
+    }
+
     return nil
 }

-func NewTemporalScaler(ctx context.Context, config *scalersconfig.ScalerConfig) (Scaler, error) {
+func NewTemporalScaler(ctx context.Context, kubeClient client.Client, config *scalersconfig.ScalerConfig) (Scaler, error) {
     logger := InitializeLogger(config, "temporal_scaler")

     metricType, err := GetMetricTargetType(config)
@@ -99,10 +157,13 @@ func NewTemporalScaler(ctx context.Context, config *scalersconfig.ScalerConfig)
     }

     return &temporalScaler{
-        metricType: metricType,
-        metadata:   meta,
-        tcl:        c,
-        logger:     logger,
+        metricType:   metricType,
+        metadata:     meta,
+        tcl:          c,
+        kubeClient:   kubeClient,
+        httpClient:   kedautil.CreateHTTPClient(config.GlobalHTTPTimeout, false),
+        logger:       logger,
+        podNamespace: config.ScalableObjectNamespace,
     }, nil
 }
@@ -164,18 +225,25 @@ func (s *temporalScaler) getQueueSize(ctx context.Context) (int64, error) {
     }

     backlog := getCombinedBacklogCount(resp)
+    metric := backlog

-    if !s.metadata.IncludeRunningWorkflowCount {
-        return backlog, nil
+    if s.metadata.IncludeRunningWorkflowCount {
+        runningCount, err := s.getRunningWorkflowCount(ctx)
+        if err != nil {
+            s.logger.V(1).Info("failed to get running workflow count, using backlog only", "error", err)
+        } else {
+            metric += runningCount
+        }
     }

-    runningCount, err := s.getRunningWorkflowCount(ctx)
+    usedSlots, err := s.getUsedWorkerSlots(ctx)
     if err != nil {
-        s.logger.V(1).Info("failed to get running workflow count, using backlog only", "error", err)
-        return backlog, nil
+        s.logger.Info("failed to get worker slots metric, excluding from metric", "error", err)
+    } else {
+        metric += usedSlots
     }

-    return backlog + runningCount, nil
+    return metric, nil
 }

 // getRunningWorkflowCount returns the approximate number of running workflow executions
@@ -201,6 +269,173 @@ func (s *temporalScaler) getRunningWorkflowCount(ctx context.Context) (int64, error) {
     return resp.GetCount(), nil
 }

+// getUsedWorkerSlots discovers worker pods in the ScaledObject's namespace and
+// scrapes their Prometheus metrics endpoint to sum temporal_worker_task_slots_used
+// for ActivityWorker slots on the configured task queue. This prevents premature
+// scale-down when workers are actively executing tasks but the task queue backlog
+// is empty.
+//
+// On transient failures (all pod scrapes fail), it returns the last known good
+// value if within the cache TTL. A total timeout budget bounds the scrape loop
+// so that slow/unreachable pods don't block the KEDA polling cycle.
+func (s *temporalScaler) getUsedWorkerSlots(ctx context.Context) (int64, error) {
+    if s.kubeClient == nil || s.httpClient == nil {
+        return 0, fmt.Errorf("kubernetes client or http client not configured")
+    }
+
+    podList := &corev1.PodList{}
+    labelSelector := client.MatchingLabels{"app.kubernetes.io/component": "worker"}
+    if err := s.kubeClient.List(ctx, podList, client.InNamespace(s.podNamespace), labelSelector); err != nil {
+        return 0, fmt.Errorf("failed to list worker pods in namespace %s: %w", s.podNamespace, err)
+    }
+
+    if len(podList.Items) == 0 {
+        return 0, nil
+    }
+
+    // Apply a timeout budget for the entire scrape loop.
+    scrapeCtx, cancel := context.WithTimeout(ctx, scrapeLoopTimeout)
+    defer cancel()
+
+    var totalUsedSlots int64
+    var scrapedCount int
+    for i := range podList.Items {
+        pod := &podList.Items[i]
+        if pod.Status.Phase != corev1.PodRunning || pod.Status.PodIP == "" || !isPodReady(pod) {
+            continue
+        }
+
+        // Stop scraping if we've exceeded the timeout budget.
+        if scrapeCtx.Err() != nil {
+            s.logger.Info("scrape loop timeout reached, using partial results",
+                "scraped", scrapedCount, "remaining", len(podList.Items)-i)
+            temporalSlotsScrapeErrors.WithLabelValues(s.podNamespace, s.metadata.TaskQueue, "scrape_loop_timeout").Inc()
+            break
+        }
+
+        slots, err := s.scrapeWorkerSlots(scrapeCtx, pod.Status.PodIP)
+        if err != nil {
+            s.logger.Info("failed to scrape worker pod metrics, skipping",
+                "pod", pod.Name, "ip", pod.Status.PodIP, "error", err)
+            temporalSlotsScrapeErrors.WithLabelValues(s.podNamespace, s.metadata.TaskQueue, "pod_scrape_error").Inc()
+            continue
+        }
+        totalUsedSlots += slots
+        scrapedCount++
+    }
+
+    s.logger.V(1).Info("worker slots metric",
+        "namespace", s.podNamespace, "totalUsedSlots", totalUsedSlots,
+        "podCount", len(podList.Items), "scrapedCount", scrapedCount)
+
+    // If no pods could be scraped, fall back to cached value within TTL.
+    if scrapedCount == 0 {
+        s.slotsMu.Lock()
+        cached := s.lastSlots
+        s.slotsMu.Unlock()
+        if time.Since(cached.timestamp) <= slotsCacheTTL {
+            s.logger.Info("all scrapes failed, using cached slots value",
+                "cachedValue", cached.value, "cacheAge", time.Since(cached.timestamp).String())
+            temporalSlotsScrapeErrors.WithLabelValues(s.podNamespace, s.metadata.TaskQueue, "all_pods_failed_cache_hit").Inc()
+            return cached.value, nil
+        }
+        s.logger.Info("all scrapes failed and cache expired, returning 0")
+        temporalSlotsScrapeErrors.WithLabelValues(s.podNamespace, s.metadata.TaskQueue, "all_pods_failed_cache_expired").Inc()
+        return 0, nil
+    }
+
+    // Update cache with the fresh value.
+    s.slotsMu.Lock()
+    s.lastSlots = slotsCache{value: totalUsedSlots, timestamp: time.Now()}
+    s.slotsMu.Unlock()
+
+    return totalUsedSlots, nil
+}
+
+// scrapeWorkerSlots fetches Prometheus metrics from a single worker pod and returns
+// the sum of temporal_worker_task_slots_used for ActivityWorker slots matching the
+// configured task queue.
+func (s *temporalScaler) scrapeWorkerSlots(ctx context.Context, podIP string) (int64, error) {
+    hostPort := net.JoinHostPort(podIP, strconv.Itoa(s.metadata.WorkerMetricsPort))
+    url := fmt.Sprintf("http://%s/metrics", hostPort)
+    req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+    if err != nil {
+        return 0, fmt.Errorf("create request: %w", err)
+    }
+
+    resp, err := s.httpClient.Do(req)
+    if err != nil {
+        return 0, fmt.Errorf("scrape %s: %w", url, err)
+    }
+    defer resp.Body.Close()
+
+    if resp.StatusCode != http.StatusOK {
+        return 0, fmt.Errorf("scrape %s returned status %d", url, resp.StatusCode)
+    }
+
+    limitedBody := io.LimitReader(resp.Body, maxMetricsResponseBytes)
+    // Only count ActivityWorker slots. WorkflowWorker slots always report >= 1
+    // due to Temporal SDK's sticky workflow cache and are not meaningful for
+    // scaling decisions. queueTypes still controls backlog counting via
+    // getQueueTypes/DescribeTaskQueueEnhanced.
+    activityOnly := map[string]bool{"ActivityWorker": true}
+    return parseUsedSlots(limitedBody, s.metadata.TaskQueue, activityOnly)
+}
+
+// parseUsedSlots parses Prometheus text format and extracts the sum of
+// temporal_worker_task_slots_used for the given worker types matching the task queue.
+func parseUsedSlots(r io.Reader, taskQueue string, workerTypes map[string]bool) (int64, error) {
+    var parser expfmt.TextParser
+    families, err := parser.TextToMetricFamilies(r)
+    if err != nil {
+        return 0, fmt.Errorf("parse prometheus metrics: %w", err)
+    }
+
+    family, ok := families["temporal_worker_task_slots_used"]
+    if !ok {
+        return 0, nil
+    }
+
+    var total int64
+    for _, m := range family.GetMetric() {
+        if matchesWorkerSlot(m, taskQueue, workerTypes) {
+            total += int64(m.GetGauge().GetValue())
+        }
+    }
+    return total, nil
+}
+
+// matchesWorkerSlot returns true if the metric's worker_type is in the allowed set
+// and (if taskQueue is non-empty) task_queue matches the configured queue.
+func matchesWorkerSlot(m *dto.Metric, taskQueue string, workerTypes map[string]bool) bool {
+    var typeMatches bool
+    // Empty taskQueue matches all queues (including metrics missing the label).
+    queueMatches := taskQueue == ""
+    for _, lp := range m.GetLabel() {
+        switch lp.GetName() {
+        case "worker_type":
+            typeMatches = workerTypes[lp.GetValue()]
+        case "task_queue":
+            if !queueMatches {
+                queueMatches = lp.GetValue() == taskQueue
+            }
+        }
+    }
+    return typeMatches && queueMatches
+}
+
+// isPodReady returns true if the pod has a Ready condition set to True.
+// Pods that aren't ready yet (e.g. during startup before the readiness probe
+// passes) likely don't have their metrics endpoint available.
+func isPodReady(pod *corev1.Pod) bool {
+    for _, c := range pod.Status.Conditions {
+        if c.Type == corev1.PodReady {
+            return c.Status == corev1.ConditionTrue
+        }
+    }
+    return false
+}
+
 func getQueueTypes(queueTypes []string) []sdk.TaskQueueType {
     var taskQueueTypes []sdk.TaskQueueType
     for _, t := range queueTypes {
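As an editorial aside, the cache fallback policy in getUsedWorkerSlots above is easier to follow with the Kubernetes and HTTP plumbing stripped away. In this sketch the TTL value and cache shape mirror the commit; resolveSlots is a hypothetical helper introduced only for illustration, not a function in the diff.

package main

import (
    "fmt"
    "time"
)

const slotsCacheTTL = 180 * time.Second

type slotsCache struct {
    value     int64
    timestamp time.Time
}

// resolveSlots returns the fresh sum when at least one pod was scraped,
// the cached value while it is still within the TTL, and 0 otherwise.
func resolveSlots(freshSum int64, scrapedCount int, cache *slotsCache, now time.Time) int64 {
    if scrapedCount > 0 {
        *cache = slotsCache{value: freshSum, timestamp: now}
        return freshSum
    }
    if now.Sub(cache.timestamp) <= slotsCacheTTL {
        return cache.value // all scrapes failed: reuse last known good value
    }
    return 0 // cache expired: stop propping up the metric
}

func main() {
    cache := slotsCache{}
    now := time.Now()
    fmt.Println(resolveSlots(12, 3, &cache, now))                     // 12: fresh scrape wins
    fmt.Println(resolveSlots(0, 0, &cache, now.Add(60*time.Second)))  // 12: within 180s TTL
    fmt.Println(resolveSlots(0, 0, &cache, now.Add(200*time.Second))) // 0: TTL expired
}

Returning 0 once the TTL lapses, rather than holding the last value indefinitely, means a persistently unreachable worker fleet eventually stops inflating the composite metric and the deployment can scale down.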
