Skip to content

Commit 95576e2

Browse files
authored
[r351] Block-builder-scheduler: fix bugs in handling of partitions with no commit (#12130) (#12165)
Backport 0a75686 from #12130
1 parent f241084 commit 95576e2

4 files changed

Lines changed: 129 additions & 39 deletions

File tree

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
* [BUGFIX] Distributor: Validate the RW2 symbols field and reject invalid requests that don't have an empty string as the first symbol. #11953
1616
* [BUGFIX] Distributor: Check `max_inflight_push_requests_bytes` before decompressing incoming requests. #11967
1717
* [BUGFIX] Query-frontend: Allow limit parameter to be 0 in label queries to explicitly request unlimited results. #12054
18+
* [BUGFIX] Distributor: Fix a possible panic in the OTLP push path while handling a gRPC status error. #12072
19+
* [BUGFIX] Query-frontend: Evaluate experimental duration expressions before sharding, splitting, and caching. Otherwise, the result is not correct. #12038
20+
* [BUGFIX] Block-builder-scheduler: Fix bugs in handling of partitions with no commit. #12130
1821

1922
### Mixin
2023

pkg/blockbuilder/scheduler/metrics.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ type schedulerMetrics struct {
1111
updateScheduleDuration prometheus.Histogram
1212
partitionStartOffset *prometheus.GaugeVec
1313
partitionCommittedOffset *prometheus.GaugeVec
14+
partitionPlannedOffset *prometheus.GaugeVec
1415
partitionEndOffset *prometheus.GaugeVec
1516
flushFailed prometheus.Counter
1617
fetchOffsetsFailed prometheus.Counter
@@ -41,6 +42,10 @@ func newSchedulerMetrics(reg prometheus.Registerer) schedulerMetrics {
4142
Name: "cortex_blockbuilder_scheduler_partition_committed_offset",
4243
Help: "The observed committed offset of each partition.",
4344
}, []string{"partition"}),
45+
partitionPlannedOffset: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
46+
Name: "cortex_blockbuilder_scheduler_partition_planned_offset",
47+
Help: "The planned offset of each partition.",
48+
}, []string{"partition"}),
4449
flushFailed: promauto.With(reg).NewCounter(prometheus.CounterOpts{
4550
Name: "cortex_blockbuilder_scheduler_flush_failed_total",
4651
Help: "The total number of Kafka flushes that failed.",

pkg/blockbuilder/scheduler/scheduler.go

Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ func (s *partitionState) updateEndOffset(end int64, ts time.Time, jobSize time.D
251251
case bucketBefore:
252252
// New bucket is before our current one. This should only happen if our
253253
// Kafka's end offsets aren't monotonically increasing.
254-
return nil, fmt.Errorf("time went backwards: %s < %s (%d, %d)", s.jobBucket, newJobBucket, s.offset, end)
254+
return nil, fmt.Errorf("time went backwards: %s < %s (%d, %d)", newJobBucket, s.jobBucket, s.offset, end)
255255
case bucketSame:
256256
// Observation is in the currently tracked bucket. No action needed.
257257
case bucketAfter:
@@ -401,11 +401,13 @@ func (s *BlockBuilderScheduler) getPartitionState(topic string, partition int32)
401401
pendingJobs: list.New(),
402402
planned: &advancingOffset{
403403
name: "planned",
404+
off: offsetEmpty,
404405
metrics: &s.metrics,
405406
logger: s.logger,
406407
},
407408
committed: &advancingOffset{
408409
name: "committed",
410+
off: offsetEmpty,
409411
metrics: &s.metrics,
410412
logger: s.logger,
411413
},
@@ -520,8 +522,9 @@ func (s *BlockBuilderScheduler) consumptionOffsets(ctx context.Context, topic st
520522

521523
var resumeOffset int64
522524

523-
if planned := ps.planned.offset(); planned > 0 {
524-
s.metrics.partitionCommittedOffset.WithLabelValues(partStr).Set(float64(planned))
525+
if !ps.planned.empty() {
526+
planned := ps.planned.offset()
527+
s.metrics.partitionPlannedOffset.WithLabelValues(partStr).Set(float64(planned))
525528
resumeOffset = planned
526529
} else {
527530
// No planned offset for this partition. Resume from fallback offset instead.
@@ -653,35 +656,44 @@ func (s *BlockBuilderScheduler) fetchCommittedOffsets(ctx context.Context) (kadm
653656
return kadm.Offsets{}, lastErr
654657
}
655658

656-
func (s *BlockBuilderScheduler) snapCommitted() kadm.Offsets {
659+
// snapOffsets returns a snapshot of the committed and planned offsets for all partitions.
660+
func (s *BlockBuilderScheduler) snapOffsets() (kadm.Offsets, kadm.Offsets) {
657661
cp := make(kadm.Offsets)
662+
pp := make(kadm.Offsets)
658663

659664
s.mu.Lock()
660665
defer s.mu.Unlock()
661666

662667
for _, ps := range s.partState {
663-
cp.AddOffset(ps.topic, ps.partition, ps.committed.offset(), 0)
668+
if !ps.committed.empty() {
669+
cp.AddOffset(ps.topic, ps.partition, ps.committed.offset(), 0)
670+
}
671+
if !ps.planned.empty() {
672+
pp.AddOffset(ps.topic, ps.partition, ps.planned.offset(), 0)
673+
}
664674
}
665675

666-
return cp
676+
return cp, pp
667677
}
668678

669679
// flushOffsetsToKafka flushes the committed offsets to Kafka and updates relevant metrics.
670680
func (s *BlockBuilderScheduler) flushOffsetsToKafka(ctx context.Context) error {
671681
// TODO: only flush if dirty.
672-
offsets := s.snapCommitted()
682+
committed, planned := s.snapOffsets()
673683

674-
offsets.Each(func(o kadm.Offset) {
684+
committed.Each(func(o kadm.Offset) {
675685
s.metrics.partitionCommittedOffset.WithLabelValues(fmt.Sprint(o.Partition)).Set(float64(o.At))
676686
})
687+
planned.Each(func(o kadm.Offset) {
688+
s.metrics.partitionPlannedOffset.WithLabelValues(fmt.Sprint(o.Partition)).Set(float64(o.At))
689+
})
677690

678-
err := s.adminClient.CommitAllOffsets(ctx, s.cfg.ConsumerGroup, offsets)
691+
err := s.adminClient.CommitAllOffsets(ctx, s.cfg.ConsumerGroup, committed)
679692
if err != nil {
680693
return fmt.Errorf("commit offsets: %w", err)
681694
}
682695

683-
level.Debug(s.logger).Log("msg", "flushed offsets to Kafka", "offsets", offsetsStr(offsets))
684-
696+
level.Debug(s.logger).Log("msg", "flushed offsets to Kafka", "offsets", offsetsStr(committed))
685697
return nil
686698
}
687699

@@ -958,6 +970,8 @@ type advancingOffset struct {
958970
logger log.Logger
959971
}
960972

973+
const offsetEmpty int64 = -1
974+
961975
// advance moves the offset forward by the given job spec. Advancements are
962976
// expected to be monotonically increasing and contiguous. Advance will not
963977
// allow backwards movement. If a gap is detected, a warning is logged and a
@@ -988,13 +1002,19 @@ func (o *advancingOffset) set(offset int64) {
9881002
o.off = offset
9891003
}
9901004

1005+
// empty returns true if the offset is empty and uninitialized.
1006+
func (o *advancingOffset) empty() bool {
1007+
return o.off == offsetEmpty
1008+
}
1009+
9911010
// validNextSpec returns true if the given job spec is valid to be added to the
9921011
// offset. It is valid if the start offset is the same as the current offset.
1012+
// We also allow transitioning out of an empty offset without calling it a gap.
9931013
func (o *advancingOffset) validNextSpec(spec schedulerpb.JobSpec) bool {
994-
return o.off == spec.StartOffset
1014+
return o.off == spec.StartOffset || o.empty()
9951015
}
9961016

9971017
// beyondSpec returns true if the offset is beyond the given job spec.
9981018
func (o *advancingOffset) beyondSpec(spec schedulerpb.JobSpec) bool {
999-
return spec.EndOffset <= o.off
1019+
return !o.empty() && spec.EndOffset <= o.off
10001020
}

pkg/blockbuilder/scheduler/scheduler_test.go

Lines changed: 88 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -370,23 +370,23 @@ func TestAssignJobSkipsObsoleteOffsets_PriorScheduler(t *testing.T) {
370370

371371
func TestObservations(t *testing.T) {
372372
sched, _ := mustScheduler(t, 10)
373-
// Initially we're in observation mode. We have Kafka's start offsets, but no client jobs.
373+
// Initially we're in observation mode. We have Kafka's commit offsets, but no client jobs.
374374

375375
sched.getPartitionState("ingest", 1).initCommit(5000)
376376
sched.getPartitionState("ingest", 2).initCommit(800)
377377
sched.getPartitionState("ingest", 3).initCommit(974)
378378
sched.getPartitionState("ingest", 4).initCommit(500)
379379
sched.getPartitionState("ingest", 5).initCommit(12000)
380-
// no 6
381-
// no 7
380+
sched.getPartitionState("ingest", 6) // no commit for 6
381+
sched.getPartitionState("ingest", 7) // no commit for 7
382382
sched.getPartitionState("ingest", 8).initCommit(1000)
383383
sched.getPartitionState("ingest", 9).initCommit(1000)
384384

385385
{
386386
nq := newJobQueue(988*time.Hour, noOpJobCreationPolicy[schedulerpb.JobSpec]{}, 2, sched.metrics, test.NewTestingLogger(t))
387387
sched.jobs = nq
388388
sched.finalizeObservations()
389-
require.Len(t, nq.jobs, 0, "No observations, no jobs")
389+
require.Empty(t, nq.jobs, "No observations, no jobs")
390390
}
391391

392392
type observation struct {
@@ -448,13 +448,12 @@ func TestObservations(t *testing.T) {
448448
mkJob(inProgress, "w103", 5, "ingest/5/12000", 33, 12000, 13000, maybeBadEpoch, errBadEpoch)
449449
mkJob(inProgress, "w104", 5, "ingest/5/12000", 34, 12000, 13000, nil, nil)
450450

451-
// Partition 6 has a complete job, but wasn't among the offsets we learned
452-
// from Kafka. We'll drop this as the start offset is nonzero, but not-found
453-
// completed jobs are ignored on update.
451+
// Partition 6 has a complete job but had no commit at startup. We allow
452+
// transitioning from empty to commit to any offset.
454453
mkJob(complete, "w0", 6, "ingest/6/500", 48, 500, 600, nil, nil)
455-
// Partition 7 has an in-progress job, but wasn't among the offsets we
456-
// learned from Kafka. We'll drop this one, too.
457-
mkJob(inProgress, "w1", 7, "ingest/7/92874", 52, 92874, 93874, nil, errJobNotFound)
454+
// Partition 7 has an in-progress job, but had no commit at startup. We
455+
// honor this job and allow it to influence the planned/resumption offset.
456+
mkJob(inProgress, "w1", 7, "ingest/7/92874", 52, 92874, 93874, nil, nil)
458457

459458
// Partition 8 has a number of reports and has a hole that should not be passed.
460459
mkJob(complete, "w0", 8, "ingest/8/1000", 53, 1000, 1100, nil, nil)
@@ -487,11 +486,11 @@ func TestObservations(t *testing.T) {
487486
}
488487

489488
if errors.Is(expectedErr, maybeBadEpoch) {
490-
assert.True(t, errors.Is(err, errBadEpoch) || err == nil, "job %V: expected either bad epoch or no error, got %v", c.key, err)
489+
require.True(t, errors.Is(err, errBadEpoch) || err == nil, "job %V: expected either bad epoch or no error, got %v", c.key, err)
491490
} else if expectedErr != nil {
492-
assert.ErrorIs(t, err, expectedErr, "job %V: expected %v, got %v", c.key, expectedErr, err)
491+
require.ErrorIs(t, err, expectedErr, "job %V: expected %v, got %v", c.key, expectedErr, err)
493492
} else {
494-
assert.NoError(t, err, "job %V: expected no error", c.key)
493+
require.NoError(t, err, "job %V: expected no error", c.key)
495494
}
496495
}
497496
})
@@ -504,15 +503,17 @@ func TestObservations(t *testing.T) {
504503
sched.requireOffset(t, "ingest", 3, 974, "ingest/3 should be unchanged - no updates")
505504
sched.requireOffset(t, "ingest", 4, 900, "ingest/4 should be moved forward to account for the completed jobs")
506505
sched.requireOffset(t, "ingest", 5, 12000, "ingest/5 has nothing new completed")
507-
sched.requireOffset(t, "ingest", 6, 0, "ingest/6 should not have been added to the offsets as there was a gap")
508-
sched.requireOffset(t, "ingest", 7, 0, "ingest/7 should not have been added to the offsets as there was a gap")
506+
sched.requireOffset(t, "ingest", 6, 600, "ingest/6 allowed to move the commit")
507+
sched.requireOffset(t, "ingest", 7, offsetEmpty, "ingest/7 has an in-progress job, but had no commit at startup")
509508
sched.requireOffset(t, "ingest", 8, 1300, "ingest/8 should be committed only until the gap")
510509
sched.requireOffset(t, "ingest", 9, 1300, "ingest/9 should be committed only until the gap")
511510
}
512511

513512
sendUpdates()
514513
sched.completeObservationMode(context.Background())
515514

515+
verifyCommits()
516+
516517
// Make sure the resumption offsets account for the gaps.
517518
offs, err := sched.consumptionOffsets(context.Background(), "ingest", time.Now())
518519
require.NoError(t, err)
@@ -523,15 +524,13 @@ func TestObservations(t *testing.T) {
523524
{topic: "ingest", partition: 3, resume: 974},
524525
{topic: "ingest", partition: 4, resume: 900},
525526
{topic: "ingest", partition: 5, resume: 13000},
526-
{topic: "ingest", partition: 6, resume: 0},
527-
{topic: "ingest", partition: 7, resume: 0},
527+
{topic: "ingest", partition: 6, resume: 600},
528+
{topic: "ingest", partition: 7, resume: 93874},
528529
{topic: "ingest", partition: 8, resume: 1300},
529530
{topic: "ingest", partition: 9, resume: 1300},
530531
}, offs)
531532

532-
verifyCommits()
533-
534-
require.Len(t, sched.jobs.jobs, 3, "should be 3 in-progress jobs")
533+
require.Len(t, sched.jobs.jobs, 4, "should be 4 in-progress jobs")
535534
require.Equal(t, 65, int(sched.jobs.epoch))
536535

537536
// Verify that the same set of updates can be sent now that we're out of
@@ -595,11 +594,17 @@ func TestKafkaFlush(t *testing.T) {
595594
ctx := context.Background()
596595
sched.completeObservationMode(ctx)
597596

598-
flushAndRequireOffsets := func(topic string, offsets map[int32]int64, args ...interface{}) {
597+
flushAndRequireOffsets := func(topic string, offsets map[int32]int64, args ...any) {
599598
require.NoError(t, sched.flushOffsetsToKafka(ctx))
600599

601600
offs, err := sched.fetchCommittedOffsets(ctx)
602601
require.NoError(t, err)
602+
offcount := 0
603+
offs.Each(func(o kadm.Offset) {
604+
offcount++
605+
})
606+
require.Equal(t, len(offsets), offcount)
607+
603608
for partition, expected := range offsets {
604609
o, ok := offs.Lookup(topic, partition)
605610
require.True(t, ok, args...)
@@ -609,24 +614,41 @@ func TestKafkaFlush(t *testing.T) {
609614

610615
flushAndRequireOffsets("ingest", map[int32]int64{}, "no group found -> no offsets")
611616

617+
_ = sched.getPartitionState("ingest", 0)
618+
// (No commit yet for p0.)
619+
612620
p1 := sched.getPartitionState("ingest", 1)
613-
p1.committed.set(2000)
621+
p1.initCommit(2000)
614622
flushAndRequireOffsets("ingest", map[int32]int64{
615623
1: 2000,
616624
})
617625

618626
p4 := sched.getPartitionState("ingest", 4)
619-
p4.committed.set(65535)
627+
p4.initCommit(65535)
620628
flushAndRequireOffsets("ingest", map[int32]int64{
621629
1: 2000,
622630
4: 65535,
623631
})
624632

625-
p1.committed.set(4000)
633+
p1.initCommit(4000)
626634
flushAndRequireOffsets("ingest", map[int32]int64{
627635
1: 4000,
628636
4: 65535,
629637
}, "should be able to advance an existing offset")
638+
639+
reg := sched.register.(*prometheus.Registry)
640+
require.NoError(t, promtest.GatherAndCompare(reg, strings.NewReader(
641+
`# HELP cortex_blockbuilder_scheduler_partition_committed_offset The observed committed offset of each partition.
642+
# TYPE cortex_blockbuilder_scheduler_partition_committed_offset gauge
643+
cortex_blockbuilder_scheduler_partition_committed_offset{partition="1"} 4000
644+
cortex_blockbuilder_scheduler_partition_committed_offset{partition="4"} 65535
645+
`), "cortex_blockbuilder_scheduler_partition_committed_offset"), "should only modify commit gauge for non-empty commit offsets")
646+
require.NoError(t, promtest.GatherAndCompare(reg, strings.NewReader(
647+
`# HELP cortex_blockbuilder_scheduler_partition_planned_offset The planned offset of each partition.
648+
# TYPE cortex_blockbuilder_scheduler_partition_planned_offset gauge
649+
cortex_blockbuilder_scheduler_partition_planned_offset{partition="1"} 4000
650+
cortex_blockbuilder_scheduler_partition_planned_offset{partition="4"} 65535
651+
`), "cortex_blockbuilder_scheduler_partition_planned_offset"), "should only modify planned gauge for non-empty planned offsets")
630652
}
631653

632654
func TestUpdateSchedule(t *testing.T) {
@@ -1235,8 +1257,7 @@ func TestBlockBuilderScheduler_EnqueuePendingJobs_GapDetection(t *testing.T) {
12351257

12361258
assert.Equal(t, 3, pt.pendingJobs.Len())
12371259
assert.Equal(t, 0, sched.jobs.count())
1238-
pt.planned.offset()
1239-
assert.Equal(t, int64(0), pt.planned.offset())
1260+
assert.True(t, pt.planned.empty())
12401261
sched.enqueuePendingJobs()
12411262
assert.Equal(t, 0, pt.pendingJobs.Len())
12421263
assert.Equal(t, 3, sched.jobs.count())
@@ -1293,3 +1314,44 @@ func TestBlockBuilderScheduler_EnqueuePendingJobs_GapDetection(t *testing.T) {
12931314
requireGaps(t, reg, part, 2, commitGaps, "expected %d commit gaps at job %d", commitGaps, j)
12941315
}
12951316
}
1317+
1318+
func TestBlockBuilderScheduler_NoCommit_NoGap(t *testing.T) {
1319+
sched, _ := mustScheduler(t, 4)
1320+
reg := sched.register.(*prometheus.Registry)
1321+
1322+
const part int32 = 1
1323+
requireGaps(t, reg, part, 0, 0)
1324+
1325+
pp := sched.getPartitionState("ingest", part)
1326+
require.True(t, pp.planned.empty())
1327+
require.True(t, pp.committed.empty())
1328+
1329+
k := jobKey{"myjob5", 5}
1330+
spec := schedulerpb.JobSpec{
1331+
Topic: "ingest",
1332+
Partition: part,
1333+
StartOffset: 10,
1334+
EndOffset: 20,
1335+
}
1336+
1337+
pp.planned.advance(k, spec)
1338+
requireGaps(t, reg, part, 0, 0, "advancing an empty planned offset should not register a gap")
1339+
1340+
pp.committed.advance(k, spec)
1341+
requireGaps(t, reg, part, 0, 0, "advancing an empty committed offset should not register a gap")
1342+
1343+
// Now create a gap:
1344+
k2 := jobKey{"myjob7", 23}
1345+
spec2 := schedulerpb.JobSpec{
1346+
Topic: "ingest",
1347+
Partition: part,
1348+
StartOffset: 40,
1349+
EndOffset: 50,
1350+
}
1351+
1352+
pp.planned.advance(k2, spec2)
1353+
requireGaps(t, reg, part, 1, 0, "a gap after a non-empty planned offset should register a gap")
1354+
1355+
pp.committed.advance(k2, spec2)
1356+
requireGaps(t, reg, part, 1, 1, "a gap after a non-empty committed offset should register a gap")
1357+
}

0 commit comments

Comments
 (0)