Skip to content

Commit fe41bc6

Browse files
Tiago Napoli and Copilot committed
[RangeIndex] Add bf-tree recover-then-drop crash repros
Add three [Explicit] tests to RespRangeIndexTests reproducing the bf-tree 0.5.0 crash where dropping a CprSnapshot-recovered, MULTI-LEVEL tree aborts in bftree_drop with 'assertion failed: next_level.is_null()' at mini_page_op.rs:429 (the panic fires on a bf-tree background thread, so --blame-crash cannot catch it):

- RICheckpointRecoverThenDrop_Crashes: RESP-level via RI.CREATE DISK + RI.SET + SAVE + restart-recover; the recovered tree is dropped on server dispose.
- BfTreeRecoverFromCprSnapshotThenDrop_Crashes: pure-native, calls a shared SeedRecoverDrop(records) helper with 378 records.
- BfTreeRecoverFromCprSnapshotThenDrop_BelowThreshold_NoCrash: control with 377 records (one below the threshold) that drops cleanly.

The crash is driven by the tree becoming multi-level (total data volume vs the small 64 KiB cache), not by record size. With tiny 4-byte key=value records, CACHESIZE 65536, and MINRECORD 8, the threshold is a sharp 378 records (377 stays single-level); it is deterministic and identical on net8/net10 and Windows/Linux. Every Insert is asserted to succeed.

All three tests are [Explicit] (the crashing ones abort the host by design; all are excluded from normal CI runs).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 237c328 commit fe41bc6

1 file changed

Lines changed: 133 additions & 0 deletions

File tree

test/standalone/Garnet.test.rangeindex/RespRangeIndexTests.cs

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@
44
using System;
55
using System.IO;
66
using System.Linq;
7+
using System.Text;
78
using System.Threading;
89
using System.Threading.Tasks;
910
using Garnet.server;
11+
using Garnet.server.BfTreeInterop;
1012
using NUnit.Framework;
1113
using NUnit.Framework.Legacy;
1214
using StackExchange.Redis;
@@ -1100,6 +1102,137 @@ public void RICheckpointAndRecoverTest()
11001102
}
11011103
}
11021104

1105+
/// <summary>
/// [Explicit] RESP-level reproduction of the bf-tree 0.5.0 recover-then-drop abort.
/// A DISK-backed range index is filled with 1000 tiny records (4-byte field and value), which
/// is enough to make the underlying BfTree MULTI-LEVEL. The index is checkpointed with SAVE,
/// the server is restarted with recovery (RecoverFromCprSnapshot), and the recovered tree is
/// dropped when the server is disposed. That drop aborts the process inside bf-tree's
/// <c>bftree_drop</c> (assertion failed: next_level.is_null() at mini_page_op.rs:429). Because
/// the panic happens on a bf-tree internal background thread (a native abort), it cannot be
/// captured by <c>--blame-crash</c>/createdump. The trigger is total data volume relative to the
/// small 64 KiB cache (the multi-level split), not record size; small trees recover and drop
/// cleanly (see <see cref="RICheckpointAndRecoverTest"/>). For the pure-native variant see
/// <see cref="BfTreeRecoverFromCprSnapshotThenDrop_Crashes"/>. Crashes the test host by design.
/// </summary>
[Test]
[Explicit("Crashes by design: drops a CprSnapshot-recovered multi-level BfTree (bf-tree 0.5.0 bug).")]
public void RICheckpointRecoverThenDrop_Crashes()
{
    const string indexName = "cpindex";
    const int recordCount = 1000; // comfortably past the multi-level threshold for records this small

    // Swap the fixture's server for a fresh AOF-enabled instance on a clean directory.
    server.Dispose();
    TestUtils.DeleteDirectory(TestUtils.MethodTestDir, wait: true);
    server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir, enableRangeIndexPreview: true, enableAOF: true);
    server.Start();

    // Phase 1: populate the index and checkpoint it. The connection is closed before the restart.
    using (var seedConnection = ConnectionMultiplexer.Connect(TestUtils.GetConfig(allowAdmin: true)))
    {
        var seedDb = seedConnection.GetDatabase(0);

        seedDb.Execute("RI.CREATE", indexName, "DISK", "CACHESIZE", "65536", "MINRECORD", "8");
        for (var n = 0; n < recordCount; n++)
        {
            // The same zero-padded 4-character string serves as both field and value.
            var fieldAndValue = n.ToString("D4");
            seedDb.Execute("RI.SET", indexName, fieldAndValue, fieldAndValue);
        }

        // Sanity check: the first record is readable before the checkpoint.
        var beforeCheckpoint = (string)seedDb.Execute("RI.GET", indexName, "0000");
        ClassicAssert.AreEqual("0000", beforeCheckpoint);

        // SAVE takes the CprSnapshot of the (by now multi-level) tree.
        seedDb.Execute("SAVE");
    }

    // Phase 2: restart with recovery so the tree is reloaded via RecoverFromCprSnapshot.
    server.Dispose();
    server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir, enableRangeIndexPreview: true, enableAOF: true, tryRecover: true);
    server.Start();

    using (var recoveredConnection = ConnectionMultiplexer.Connect(TestUtils.GetConfig()))
    {
        var recoveredDb = recoveredConnection.GetDatabase(0);
        var afterRecovery = (string)recoveredDb.Execute("RI.GET", indexName, "0000");
        ClassicAssert.AreEqual("0000", afterRecovery,
            "data should survive checkpoint + recovery");
    }

    // Nothing more to do here: the recovered multi-level tree is live, and disposing the server
    // (in TearDown, via RangeIndexManager.Dispose) drops it, tripping the bf-tree assert and
    // aborting the process.
}
1160+
1161+
// Shared body of the native recover-then-drop repro tests.
//
// Creates a disk-backed BfTree, inserts `records` tiny 4-byte key/value records (key == value ==
// i.ToString("D4")), takes a CprSnapshot, then recovers a second tree from a copy of that
// snapshot and drops the recovered tree. With these parameters (CACHESIZE 65536, MINRECORD 8)
// the tree becomes MULTI-LEVEL at exactly 378 records: dropping a recovered multi-level tree
// aborts the process in bftree_drop (assertion failed: next_level.is_null() at
// mini_page_op.rs:429, on a bf-tree background thread), while 377 records stays single-level
// and drops cleanly. The crash is driven by total data volume vs the small cache (the
// multi-level split), not by record size. The 377/378 boundary is deterministic and identical
// on net8/net10 and Windows/Linux (the structure is fixed by the native bf-tree binary + the
// inserted data).
//
// records: number of records to insert before snapshotting.
//
// Every Insert is asserted to succeed so a silent InvalidKV (limit) failure cannot masquerade
// as a valid repro. If an assertion or any other managed exception is thrown, the seed tree is
// still disposed and the temp directory is still removed (best effort).
private static void SeedRecoverDrop(int records)
{
    // The crashing variant aborts the process mid-test, so its temp dir is never cleaned up
    // afterward. Sweep any leftovers from prior (crashed) runs up front so they don't pile up.
    // NOTE(review): the prefix match would also hit the directory of another instance of this
    // helper running at the same time on the same machine.
    foreach (var stale in Directory.EnumerateDirectories(Path.GetTempPath(), "bftree_cprrepro_*"))
    {
        TryDeleteDirectory(stale);
    }

    var dir = Path.Combine(Path.GetTempPath(), $"bftree_cprrepro_{Guid.NewGuid():N}");
    Directory.CreateDirectory(dir);

    try
    {
        var seedScratch = Path.Combine(dir, "seed.scratch.cpr");
        var snapshot = Path.Combine(dir, "snap.bftree");

        var seed = new BfTreeService(StorageBackendType.Disk,
            Path.Combine(dir, "seed.data.bftree"), seedScratch,
            cbSizeByte: 65536, cbMinRecordSize: 8);
        try
        {
            for (var i = 0; i < records; i++)
            {
                var bytes = Encoding.ASCII.GetBytes(i.ToString("D4"));
                ClassicAssert.AreEqual(BfTreeInsertResult.Success, seed.Insert(bytes, bytes), $"insert {i} should succeed");
            }

            seed.CprSnapshot();

            // Copy the snapshot aside while the seed tree is still alive, as the recovery
            // below reads from the copy rather than from the seed's own scratch file.
            File.Copy(seedScratch, snapshot, overwrite: false);
        }
        finally
        {
            // Always release the seed tree, even when an insert assertion failed above.
            seed.Dispose();
        }

        // Recover from the snapshot and drop: aborts in bftree_drop iff the tree is multi-level.
        var tree = BfTreeService.RecoverFromCprSnapshot(snapshot, Path.Combine(dir, "rec.scratch.cpr"), StorageBackendType.Disk);
        tree.Dispose();
    }
    finally
    {
        // Best-effort cleanup for every path that returns or throws; the crashing path aborts
        // the process before reaching here and is swept by the next run's up-front cleanup.
        TryDeleteDirectory(dir);
    }

    // Deletes a directory tree, ignoring the file-system failures Directory.Delete can raise
    // (entries in use, read-only entries, already removed) because cleanup is best effort.
    static void TryDeleteDirectory(string path)
    {
        try
        {
            Directory.Delete(path, recursive: true);
        }
        catch (IOException)
        {
            // In use or already gone; a later run sweeps it.
        }
        catch (UnauthorizedAccessException)
        {
            // Read-only or access denied; nothing useful to do from a test helper.
        }
    }
}
1204+
1205+
/// <summary>
/// [Explicit] Native-only counterpart of <see cref="RICheckpointRecoverThenDrop_Crashes"/>: it
/// drives <see cref="BfTreeService"/> directly, with no Garnet server involved. 378 tiny records
/// are enough to make the tree MULTI-LEVEL, and dropping the tree recovered from its CprSnapshot
/// aborts the process in <c>bftree_drop</c> (assertion failed: next_level.is_null() at
/// mini_page_op.rs:429) on a bf-tree background thread. 378 is the smallest count that triggers
/// the abort; one record fewer drops cleanly, as shown by
/// <see cref="BfTreeRecoverFromCprSnapshotThenDrop_BelowThreshold_NoCrash"/>. Crashes the host
/// by design.
/// </summary>
[Test]
[Explicit("Crashes by design: 378 records (multi-level) recovered then dropped (bf-tree 0.5.0 bug).")]
public void BfTreeRecoverFromCprSnapshotThenDrop_Crashes()
    => SeedRecoverDrop(records: 378);
1220+
1221+
/// <summary>
/// Control case for <see cref="BfTreeRecoverFromCprSnapshotThenDrop_Crashes"/>. It runs the
/// identical create → snapshot → recover → drop sequence, but with 377 records, which is one
/// short of the multi-level split threshold. The tree therefore stays SINGLE-LEVEL, the recovered
/// tree drops cleanly, and the test passes. The record count (377 vs 378) is the only difference
/// between the two tests, which demonstrates how sharp the structural threshold is.
/// </summary>
[Test]
[Explicit("Control: 377 records (one below the threshold, single-level) drops cleanly (no crash).")]
public void BfTreeRecoverFromCprSnapshotThenDrop_BelowThreshold_NoCrash()
{
    // One record fewer than the crashing variant uses.
    const int belowThresholdRecords = 377;

    SeedRecoverDrop(records: belowThresholdRecords);

    // Reaching this line at all is the success criterion: the process was not aborted.
    Assert.Pass("Below-threshold (single-level) recovered tree dropped without crashing.");
}
1235+
11031236
/// <summary>
11041237
/// Full lifecycle test: create → insert → flush to read-only → promote → mutate →
11051238
/// evict to disk → restore from flush → checkpoint → recover. Verifies data

0 commit comments

Comments
 (0)