Skip to content
This repository was archived by the owner on Aug 13, 2019. It is now read-only.

Commit dfed85e

Browse files
authored
Keep series that are still in WAL in checkpoints (#577)
If all the samples are deleted for a series, we should still keep the series in the WAL as anything else reading the WAL will still care about it in order to understand the samples. Signed-off-by: Brian Brazil <[email protected]>
1 parent 259847a commit dfed85e

File tree

2 files changed

+88
-1
lines changed

2 files changed

+88
-1
lines changed

head.go

+37-1
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,9 @@ type Head struct {
7575
symbols map[string]struct{}
7676
values map[string]stringset // label names to possible values
7777

78+
deletedMtx sync.Mutex
79+
deleted map[uint64]int // Deleted series, and what WAL segment they must be kept until.
80+
7881
postings *index.MemPostings // postings lists for terms
7982
}
8083

@@ -234,6 +237,7 @@ func NewHead(r prometheus.Registerer, l log.Logger, wal *wal.WAL, chunkRange int
234237
values: map[string]stringset{},
235238
symbols: map[string]struct{}{},
236239
postings: index.NewUnorderedMemPostings(),
240+
deleted: map[uint64]int{},
237241
}
238242
h.metrics = newHeadMetrics(h, r)
239243

@@ -557,7 +561,13 @@ func (h *Head) Truncate(mint int64) (err error) {
557561
}
558562

559563
keep := func(id uint64) bool {
560-
return h.series.getByID(id) != nil
564+
if h.series.getByID(id) != nil {
565+
return true
566+
}
567+
h.deletedMtx.Lock()
568+
_, ok := h.deleted[id]
569+
h.deletedMtx.Unlock()
570+
return ok
561571
}
562572
h.metrics.checkpointCreationTotal.Inc()
563573
if _, err = Checkpoint(h.wal, first, last, keep, mint); err != nil {
@@ -570,6 +580,17 @@ func (h *Head) Truncate(mint int64) (err error) {
570580
// that supersedes them.
571581
level.Error(h.logger).Log("msg", "truncating segments failed", "err", err)
572582
}
583+
584+
// The checkpoint is written and segments before it are truncated, so we no
585+
// longer need to track deleted series that are before it.
586+
h.deletedMtx.Lock()
587+
for ref, segment := range h.deleted {
588+
if segment < first {
589+
delete(h.deleted, ref)
590+
}
591+
}
592+
h.deletedMtx.Unlock()
593+
573594
h.metrics.checkpointDeleteTotal.Inc()
574595
if err := DeleteCheckpoints(h.wal.Dir(), last); err != nil {
575596
// Leftover old checkpoints do not cause problems down the line beyond
@@ -953,6 +974,21 @@ func (h *Head) gc() {
953974
// Remove deleted series IDs from the postings lists.
954975
h.postings.Delete(deleted)
955976

977+
if h.wal != nil {
978+
_, last, _ := h.wal.Segments()
979+
h.deletedMtx.Lock()
980+
// Keep series records until we're past segment 'last'
981+
// because the WAL will still have sample records with
982+
// this ref ID. If we didn't keep these series records, then
983+
// WAL replay on start up, or any other code
984+
// that reads the WAL, wouldn't be able to use those
985+
// samples since we would have no labels for that ref ID.
986+
for ref := range deleted {
987+
h.deleted[ref] = last
988+
}
989+
h.deletedMtx.Unlock()
990+
}
991+
956992
// Rebuild symbols and label value indices from what is left in the postings terms.
957993
symbols := make(map[string]struct{}, len(h.symbols))
958994
values := make(map[string]stringset, len(h.values))

head_test.go

+51
Original file line numberDiff line numberDiff line change
@@ -501,6 +501,57 @@ func TestDeleteUntilCurMax(t *testing.T) {
501501
testutil.Ok(t, err)
502502
testutil.Equals(t, []tsdbutil.Sample{sample{11, 1}}, ressmpls)
503503
}
504+
505+
func TestDeletedSamplesAndSeriesStillInWALAfterCheckpoint(t *testing.T) {
506+
dir, err := ioutil.TempDir("", "test_delete_wal")
507+
testutil.Ok(t, err)
508+
defer func() {
509+
testutil.Ok(t, os.RemoveAll(dir))
510+
}()
511+
wlog, err := wal.NewSize(nil, nil, dir, 32768)
512+
testutil.Ok(t, err)
513+
514+
// Enough samples to cause a checkpoint.
515+
numSamples := 10000
516+
hb, err := NewHead(nil, nil, wlog, int64(numSamples)*10)
517+
testutil.Ok(t, err)
518+
defer hb.Close()
519+
for i := 0; i < numSamples; i++ {
520+
app := hb.Appender()
521+
_, err := app.Add(labels.Labels{{"a", "b"}}, int64(i), 0)
522+
testutil.Ok(t, err)
523+
testutil.Ok(t, app.Commit())
524+
}
525+
testutil.Ok(t, hb.Delete(0, int64(numSamples), labels.NewEqualMatcher("a", "b")))
526+
testutil.Ok(t, hb.Truncate(1))
527+
testutil.Ok(t, hb.Close())
528+
529+
// Confirm there's been a checkpoint.
530+
cdir, _, err := LastCheckpoint(dir)
531+
testutil.Ok(t, err)
532+
// Read in checkpoint and WAL.
533+
recs := readTestWAL(t, cdir)
534+
recs = append(recs, readTestWAL(t, dir)...)
535+
536+
var series, samples, stones int
537+
for _, rec := range recs {
538+
switch rec.(type) {
539+
case []RefSeries:
540+
series++
541+
case []RefSample:
542+
samples++
543+
case []Stone:
544+
stones++
545+
default:
546+
t.Fatalf("unknown record type")
547+
}
548+
}
549+
testutil.Equals(t, 1, series)
550+
testutil.Equals(t, 9999, samples)
551+
testutil.Equals(t, 1, stones)
552+
553+
}
554+
504555
func TestDelete_e2e(t *testing.T) {
505556
numDatapoints := 1000
506557
numRanges := 1000

0 commit comments

Comments
 (0)