diff --git a/Garnet.sln b/Garnet.sln
index 8d500f3704e..9f6ad55c83d 100644
--- a/Garnet.sln
+++ b/Garnet.sln
@@ -118,6 +118,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ETag", "samples\ETag\ETag.c
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Garnet.fuzz", "test\Garnet.fuzz\Garnet.fuzz.csproj", "{7A42F7AA-EE93-49B1-8711-A1D6D948F5FC}"
EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Btree", "playground\BTree\Btree.csproj", "{CE12831B-2805-469E-8208-759DC4B4862C}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -358,6 +360,14 @@ Global
{7A42F7AA-EE93-49B1-8711-A1D6D948F5FC}.Release|Any CPU.Build.0 = Release|Any CPU
{7A42F7AA-EE93-49B1-8711-A1D6D948F5FC}.Release|x64.ActiveCfg = Release|Any CPU
{7A42F7AA-EE93-49B1-8711-A1D6D948F5FC}.Release|x64.Build.0 = Release|Any CPU
+ {CE12831B-2805-469E-8208-759DC4B4862C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {CE12831B-2805-469E-8208-759DC4B4862C}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {CE12831B-2805-469E-8208-759DC4B4862C}.Debug|x64.ActiveCfg = Debug|Any CPU
+ {CE12831B-2805-469E-8208-759DC4B4862C}.Debug|x64.Build.0 = Debug|Any CPU
+ {CE12831B-2805-469E-8208-759DC4B4862C}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {CE12831B-2805-469E-8208-759DC4B4862C}.Release|Any CPU.Build.0 = Release|Any CPU
+ {CE12831B-2805-469E-8208-759DC4B4862C}.Release|x64.ActiveCfg = Release|Any CPU
+ {CE12831B-2805-469E-8208-759DC4B4862C}.Release|x64.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -395,6 +405,7 @@ Global
{4FBA1587-BAFC-49F8-803A-D1CF431A26F5} = {7068BB97-1958-4060-B5F1-859464592E56}
{7A42F7AA-EE93-49B1-8711-A1D6D948F5FC} = {9A03717A-4E0B-49CA-8579-A02A4C1D003F}
{1A5DF817-D0DD-4F0B-AE3A-C9CD0E03C9D5} = {D8A9CE6E-91B9-4B84-B44A-2BCF1161A793}
+ {CE12831B-2805-469E-8208-759DC4B4862C} = {69A71E2C-00E3-42F3-854E-BE157A24834E}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {2C02C405-4798-41CA-AF98-61EDFEF6772E}
diff --git a/libs/common/RespReadUtils.cs b/libs/common/RespReadUtils.cs
index 64c0b77e178..aba80473762 100644
--- a/libs/common/RespReadUtils.cs
+++ b/libs/common/RespReadUtils.cs
@@ -639,6 +639,16 @@ public static bool TryReadInt64WithLengthHeader(out long number, ref byte* ptr,
return true;
}
+ /// <summary>
+ /// Tries to read a ulong from the given ASCII-encoded RESP string.
+ /// Note: this does not check for any length headers and is simply an accessor to TryReadUInt64.
+ /// </summary>
+ /// <param name="number">If parsing was successful, contains the parsed ulong value.</param>
+ /// <param name="ptr">The starting position in the RESP string. Will be advanced if parsing is successful.</param>
+ /// <param name="end">The current end of the RESP string.</param>
+ /// <returns>True if a ulong was successfully parsed.</returns>
+ public static bool ReadUlong(out ulong number, ref byte* ptr, byte* end) => TryReadUInt64(ref ptr, end, out number, out _);
+
/// <summary>
/// Read long with length header
/// </summary>
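
For reference, the parsing contract here is: consume ASCII digits at `ptr`, accumulate them into a `ulong`, and advance `ptr` past the last digit. A minimal self-contained sketch of that behavior (an illustrative stand-in, not Garnet's `TryReadUInt64`; overflow and RESP framing checks are omitted):

```csharp
using System;

class UlongParseSketch
{
    // Stand-in with the same contract as RespReadUtils.ReadUlong: parse ASCII
    // digits at *ptr into a ulong, advancing ptr past them on success.
    static unsafe bool TryParseUlong(out ulong number, ref byte* ptr, byte* end)
    {
        number = 0;
        var start = ptr;
        while (ptr < end && *ptr >= (byte)'0' && *ptr <= (byte)'9')
        {
            number = number * 10 + (ulong)(*ptr - (byte)'0');
            ptr++;
        }
        return ptr > start; // at least one digit was consumed
    }

    static unsafe void Main()
    {
        var buffer = "1712345678901-0"u8.ToArray(); // stream-ID-shaped input
        fixed (byte* b = buffer)
        {
            byte* p = b, end = b + buffer.Length;
            if (TryParseUlong(out var ms, ref p, end))
                Console.WriteLine(ms); // 1712345678901; p now points at '-'
        }
    }
}
```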
diff --git a/libs/host/Configuration/Options.cs b/libs/host/Configuration/Options.cs
index 1baf17c52d6..9d6ed7b0a8f 100644
--- a/libs/host/Configuration/Options.cs
+++ b/libs/host/Configuration/Options.cs
@@ -632,6 +632,10 @@ public IEnumerable<string> LuaAllowedFunctions
[Option("expired-key-deletion-scan-freq", Required = false, HelpText = "Frequency of background scan for expired key deletion, in seconds")]
public int ExpiredKeyDeletionScanFrequencySecs { get; set; }
+ [OptionValidation]
+ [Option("streams", Required = false, HelpText = "Enable streams on server.")]
+ public bool? EnableStreams { get; set; }
+
/// <summary>
/// This property contains all arguments that were not parsed by the command line argument parser
/// </summary>
@@ -903,6 +907,7 @@ public GarnetServerOptions GetServerOptions(ILogger logger = null)
UnixSocketPermission = unixSocketPermissions,
MaxDatabases = MaxDatabases,
ExpiredKeyDeletionScanFrequencySecs = ExpiredKeyDeletionScanFrequencySecs,
+ EnableStreams = EnableStreams.GetValueOrDefault(),
};
}
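
With the flag plumbed through `GetServerOptions`, streams can also be enabled when hosting Garnet in-process. A minimal embedded-host sketch, assuming the embedded-server API from Garnet's samples (endpoint, storage, and logging settings elided):

```csharp
using Garnet;
using Garnet.server;

// EnableStreams here mirrors what `--streams` sets through the CLI parser
// above; all other required options are left at their defaults for brevity.
var opts = new GarnetServerOptions { EnableStreams = true };
using var server = new GarnetServer(opts);
server.Start();
```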
diff --git a/libs/host/defaults.conf b/libs/host/defaults.conf
index 9b807eb6d76..6bb2f37716d 100644
--- a/libs/host/defaults.conf
+++ b/libs/host/defaults.conf
@@ -413,6 +413,9 @@
/* Max number of logical databases allowed in a single Garnet server instance */
"MaxDatabases": 16,
+ /* Enable use of streams inside Garnet */
+ "EnableStreams": false,
+
/* Frequency of background scan for expired key deletion, in seconds */
"ExpiredKeyDeletionScanFrequencySecs": -1
}
\ No newline at end of file
diff --git a/libs/resources/RespCommandsDocs.json b/libs/resources/RespCommandsDocs.json
index 196d8ac23ab..77c0adea68d 100644
--- a/libs/resources/RespCommandsDocs.json
+++ b/libs/resources/RespCommandsDocs.json
@@ -7693,6 +7693,290 @@
}
]
},
+ {
+ "Command": "XADD",
+ "Name": "XADD",
+ "Summary": "Appends a new message to a stream. Creates the key if it doesn\u0027t exist.",
+ "Group": "Stream",
+ "Complexity": "O(1) when adding a new entry, O(N) when trimming where N being the number of entries evicted.",
+ "Arguments": [
+ {
+ "TypeDiscriminator": "RespCommandKeyArgument",
+ "Name": "KEY",
+ "DisplayText": "key",
+ "Type": "Key",
+ "KeySpecIndex": 0
+ },
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "NOMKSTREAM",
+ "DisplayText": "nomkstream",
+ "Type": "PureToken",
+ "Token": "NOMKSTREAM",
+ "ArgumentFlags": "Optional"
+ },
+ {
+ "TypeDiscriminator": "RespCommandContainerArgument",
+ "Name": "TRIM",
+ "Type": "Block",
+ "ArgumentFlags": "Optional",
+ "Arguments": [
+ {
+ "TypeDiscriminator": "RespCommandContainerArgument",
+ "Name": "STRATEGY",
+ "Type": "OneOf",
+ "Arguments": [
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "MAXLEN",
+ "DisplayText": "maxlen",
+ "Type": "PureToken",
+ "Token": "MAXLEN"
+ },
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "MINID",
+ "DisplayText": "minid",
+ "Type": "PureToken",
+ "Token": "MINID"
+ }
+ ]
+ },
+ {
+ "TypeDiscriminator": "RespCommandContainerArgument",
+ "Name": "OPERATOR",
+ "Type": "OneOf",
+ "ArgumentFlags": "Optional",
+ "Arguments": [
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "EQUAL",
+ "DisplayText": "equal",
+ "Type": "PureToken",
+ "Token": "="
+ },
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "APPROXIMATELY",
+ "DisplayText": "approximately",
+ "Type": "PureToken",
+ "Token": "~"
+ }
+ ]
+ },
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "THRESHOLD",
+ "DisplayText": "threshold",
+ "Type": "String"
+ },
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "COUNT",
+ "DisplayText": "count",
+ "Type": "Integer",
+ "Token": "LIMIT",
+ "ArgumentFlags": "Optional"
+ }
+ ]
+ },
+ {
+ "TypeDiscriminator": "RespCommandContainerArgument",
+ "Name": "ID-SELECTOR",
+ "Type": "OneOf",
+ "Arguments": [
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "AUTO-ID",
+ "DisplayText": "auto-id",
+ "Type": "PureToken",
+ "Token": "*"
+ },
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "ID",
+ "DisplayText": "id",
+ "Type": "String"
+ }
+ ]
+ },
+ {
+ "TypeDiscriminator": "RespCommandContainerArgument",
+ "Name": "DATA",
+ "Type": "Block",
+ "ArgumentFlags": "Multiple",
+ "Arguments": [
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "FIELD",
+ "DisplayText": "field",
+ "Type": "String"
+ },
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "VALUE",
+ "DisplayText": "value",
+ "Type": "String"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "Command": "XDEL",
+ "Name": "XDEL",
+ "Summary": "Returns the number of messages after removing them from a stream.",
+ "Group": "Stream",
+ "Complexity": "O(1) for each single item to delete in the stream, regardless of the stream size.",
+ "Arguments": [
+ {
+ "TypeDiscriminator": "RespCommandKeyArgument",
+ "Name": "KEY",
+ "DisplayText": "key",
+ "Type": "Key",
+ "KeySpecIndex": 0
+ },
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "ID",
+ "DisplayText": "id",
+ "Type": "String",
+ "ArgumentFlags": "Multiple"
+ }
+ ]
+ },
+ {
+ "Command": "XLEN",
+ "Name": "XLEN",
+ "Summary": "Return the number of messages in a stream.",
+ "Group": "Stream",
+ "Complexity": "O(1)",
+ "Arguments": [
+ {
+ "TypeDiscriminator": "RespCommandKeyArgument",
+ "Name": "KEY",
+ "DisplayText": "key",
+ "Type": "Key",
+ "KeySpecIndex": 0
+ }
+ ]
+ },
+ {
+ "Command": "XRANGE",
+ "Name": "XRANGE",
+ "Summary": "Returns the messages from a stream within a range of IDs.",
+ "Group": "Stream",
+ "Complexity": "O(N) with N being the number of elements being returned. If N is constant (e.g. always asking for the first 10 elements with COUNT), you can consider it O(1).",
+ "Arguments": [
+ {
+ "TypeDiscriminator": "RespCommandKeyArgument",
+ "Name": "KEY",
+ "DisplayText": "key",
+ "Type": "Key",
+ "KeySpecIndex": 0
+ },
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "START",
+ "DisplayText": "start",
+ "Type": "String"
+ },
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "END",
+ "DisplayText": "end",
+ "Type": "String"
+ },
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "COUNT",
+ "DisplayText": "count",
+ "Type": "Integer",
+ "Token": "COUNT",
+ "ArgumentFlags": "Optional"
+ }
+ ]
+ },
+ {
+ "Command": "XTRIM",
+ "Name": "XTRIM",
+ "Summary": "Deletes messages from the beginning of a stream.",
+ "Group": "Stream",
+ "Complexity": "O(N), with N being the number of evicted entries. Constant times are very small however, since entries are organized in macro nodes containing multiple entries that can be released with a single deallocation.",
+ "Arguments": [
+ {
+ "TypeDiscriminator": "RespCommandKeyArgument",
+ "Name": "KEY",
+ "DisplayText": "key",
+ "Type": "Key",
+ "KeySpecIndex": 0
+ },
+ {
+ "TypeDiscriminator": "RespCommandContainerArgument",
+ "Name": "TRIM",
+ "Type": "Block",
+ "Arguments": [
+ {
+ "TypeDiscriminator": "RespCommandContainerArgument",
+ "Name": "STRATEGY",
+ "Type": "OneOf",
+ "Arguments": [
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "MAXLEN",
+ "DisplayText": "maxlen",
+ "Type": "PureToken",
+ "Token": "MAXLEN"
+ },
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "MINID",
+ "DisplayText": "minid",
+ "Type": "PureToken",
+ "Token": "MINID"
+ }
+ ]
+ },
+ {
+ "TypeDiscriminator": "RespCommandContainerArgument",
+ "Name": "OPERATOR",
+ "Type": "OneOf",
+ "ArgumentFlags": "Optional",
+ "Arguments": [
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "EQUAL",
+ "DisplayText": "equal",
+ "Type": "PureToken",
+ "Token": "="
+ },
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "APPROXIMATELY",
+ "DisplayText": "approximately",
+ "Type": "PureToken",
+ "Token": "~"
+ }
+ ]
+ },
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "THRESHOLD",
+ "DisplayText": "threshold",
+ "Type": "String"
+ },
+ {
+ "TypeDiscriminator": "RespCommandBasicArgument",
+ "Name": "COUNT",
+ "DisplayText": "count",
+ "Type": "Integer",
+ "Token": "LIMIT",
+ "ArgumentFlags": "Optional"
+ }
+ ]
+ }
+ ]
+ },
{
"Command": "ZADD",
"Name": "ZADD",
diff --git a/libs/resources/RespCommandsInfo.json b/libs/resources/RespCommandsInfo.json
index e6166c5ef48..2f613d67284 100644
--- a/libs/resources/RespCommandsInfo.json
+++ b/libs/resources/RespCommandsInfo.json
@@ -5026,14 +5026,17 @@
]
},
{
- "Command": "ZADD",
- "Name": "ZADD",
- "Arity": -4,
+ "Command": "XADD",
+ "Name": "XADD",
+ "Arity": -5,
"Flags": "DenyOom, Fast, Write",
"FirstKey": 1,
"LastKey": 1,
"Step": 1,
- "AclCategories": "Fast, SortedSet, Write",
+ "AclCategories": "Fast, Stream, Write",
+ "Tips": [
+ "nondeterministic_output"
+ ],
"KeySpecifications": [
{
"BeginSearch": {
@@ -5046,19 +5049,45 @@
"KeyStep": 1,
"Limit": 0
},
+ "Notes": "UPDATE instead of INSERT because of the optional trimming feature",
"Flags": "RW, Update"
}
]
},
{
- "Command": "ZCARD",
- "Name": "ZCARD",
+ "Command": "XDEL",
+ "Name": "XDEL",
+ "Arity": -3,
+ "Flags": "Fast, Write",
+ "FirstKey": 1,
+ "LastKey": 1,
+ "Step": 1,
+ "AclCategories": "Fast, Stream, Write",
+ "KeySpecifications": [
+ {
+ "BeginSearch": {
+ "TypeDiscriminator": "BeginSearchIndex",
+ "Index": 1
+ },
+ "FindKeys": {
+ "TypeDiscriminator": "FindKeysRange",
+ "LastKey": 0,
+ "KeyStep": 1,
+ "Limit": 0
+ },
+ "Flags": "RW, Delete"
+ }
+ ]
+ },
+ {
+ "Command": "XLEN",
+ "Name": "XLEN",
"Arity": 2,
"Flags": "Fast, ReadOnly",
"FirstKey": 1,
"LastKey": 1,
"Step": 1,
- "AclCategories": "Fast, Read, SortedSet",
+ "AclCategories": "Fast, Read, Stream",
"KeySpecifications": [
{
"BeginSearch": {
@@ -5076,9 +5105,87 @@
]
},
{
- "Command": "ZCOUNT",
- "Name": "ZCOUNT",
- "Arity": 4,
+ "Command": "XRANGE",
+ "Name": "XRANGE",
+ "Arity": -4,
+ "Flags": "ReadOnly",
+ "FirstKey": 1,
+ "LastKey": 1,
+ "Step": 1,
+ "AclCategories": "Read, Slow, Stream",
+ "KeySpecifications": [
+ {
+ "BeginSearch": {
+ "TypeDiscriminator": "BeginSearchIndex",
+ "Index": 1
+ },
+ "FindKeys": {
+ "TypeDiscriminator": "FindKeysRange",
+ "LastKey": 0,
+ "KeyStep": 1,
+ "Limit": 0
+ },
+ "Flags": "RO, Access"
+ }
+ ]
+ },
+ {
+ "Command": "XTRIM",
+ "Name": "XTRIM",
+ "Arity": -4,
+ "Flags": "Write",
+ "FirstKey": 1,
+ "LastKey": 1,
+ "Step": 1,
+ "AclCategories": "Slow, Stream, Write",
+ "Tips": [
+ "nondeterministic_output"
+ ],
+ "KeySpecifications": [
+ {
+ "BeginSearch": {
+ "TypeDiscriminator": "BeginSearchIndex",
+ "Index": 1
+ },
+ "FindKeys": {
+ "TypeDiscriminator": "FindKeysRange",
+ "LastKey": 0,
+ "KeyStep": 1,
+ "Limit": 0
+ },
+ "Flags": "RW, Delete"
+ }
+ ]
+ },
+ {
+ "Command": "ZADD",
+ "Name": "ZADD",
+ "Arity": -4,
+ "Flags": "DenyOom, Fast, Write",
+ "FirstKey": 1,
+ "LastKey": 1,
+ "Step": 1,
+ "AclCategories": "Fast, SortedSet, Write",
+ "KeySpecifications": [
+ {
+ "BeginSearch": {
+ "TypeDiscriminator": "BeginSearchIndex",
+ "Index": 1
+ },
+ "FindKeys": {
+ "TypeDiscriminator": "FindKeysRange",
+ "LastKey": 0,
+ "KeyStep": 1,
+ "Limit": 0
+ },
+ "Flags": "RW, Update"
+ }
+ ]
+ },
+ {
+ "Command": "ZCARD",
+ "Name": "ZCARD",
+ "Arity": 2,
"Flags": "Fast, ReadOnly",
"FirstKey": 1,
"LastKey": 1,
@@ -5096,7 +5203,7 @@
"KeyStep": 1,
"Limit": 0
},
- "Flags": "RO, Access"
+ "Flags": "RO"
}
]
},
@@ -5108,7 +5215,7 @@
"FirstKey": 1,
"LastKey": 1,
"Step": 1,
- "AclCategories": "SortedSet, Write, Admin, Garnet",
+ "AclCategories": "Admin, SortedSet, Write, Garnet",
"KeySpecifications": [
{
"BeginSearch": {
@@ -5125,6 +5232,31 @@
}
]
},
+ {
+ "Command": "ZCOUNT",
+ "Name": "ZCOUNT",
+ "Arity": 4,
+ "Flags": "Fast, ReadOnly",
+ "FirstKey": 1,
+ "LastKey": 1,
+ "Step": 1,
+ "AclCategories": "Fast, Read, SortedSet",
+ "KeySpecifications": [
+ {
+ "BeginSearch": {
+ "TypeDiscriminator": "BeginSearchIndex",
+ "Index": 1
+ },
+ "FindKeys": {
+ "TypeDiscriminator": "FindKeysRange",
+ "LastKey": 0,
+ "KeyStep": 1,
+ "Limit": 0
+ },
+ "Flags": "RO, Access"
+ }
+ ]
+ },
{
"Command": "ZDIFF",
"Name": "ZDIFF",
@@ -5193,7 +5325,7 @@
"FirstKey": 1,
"LastKey": 1,
"Step": 1,
- "AclCategories": "SortedSet, Fast, Write, Garnet",
+ "AclCategories": "Fast, SortedSet, Write, Garnet",
"KeySpecifications": [
{
"BeginSearch": {
@@ -5218,7 +5350,7 @@
"FirstKey": 1,
"LastKey": 1,
"Step": 1,
- "AclCategories": "SortedSet, Fast, Write, Garnet",
+ "AclCategories": "Fast, SortedSet, Write, Garnet",
"KeySpecifications": [
{
"BeginSearch": {
@@ -5243,7 +5375,7 @@
"FirstKey": 1,
"LastKey": 1,
"Step": 1,
- "AclCategories": "SortedSet, Fast, Read, Garnet",
+ "AclCategories": "Fast, Read, SortedSet, Garnet",
"KeySpecifications": [
{
"BeginSearch": {
@@ -5447,7 +5579,7 @@
"FirstKey": 1,
"LastKey": 1,
"Step": 1,
- "AclCategories": "SortedSet, Fast, Write, Garnet",
+ "AclCategories": "Fast, SortedSet, Write, Garnet",
"KeySpecifications": [
{
"BeginSearch": {
@@ -5472,7 +5604,7 @@
"FirstKey": 1,
"LastKey": 1,
"Step": 1,
- "AclCategories": "SortedSet, Fast, Write, Garnet",
+ "AclCategories": "Fast, SortedSet, Write, Garnet",
"KeySpecifications": [
{
"BeginSearch": {
@@ -5497,7 +5629,7 @@
"FirstKey": 1,
"LastKey": 1,
"Step": 1,
- "AclCategories": "SortedSet, Fast, Write, Garnet",
+ "AclCategories": "Fast, SortedSet, Write, Garnet",
"KeySpecifications": [
{
"BeginSearch": {
@@ -5522,7 +5654,7 @@
"FirstKey": 1,
"LastKey": 1,
"Step": 1,
- "AclCategories": "SortedSet, Fast, Read, Garnet",
+ "AclCategories": "Fast, Read, SortedSet, Garnet",
"KeySpecifications": [
{
"BeginSearch": {
@@ -5540,14 +5672,14 @@
]
},
{
- "Command": "ZPTTL",
- "Name": "ZPTTL",
- "Arity": -5,
- "Flags": "Fast, ReadOnly",
+ "Command": "ZPOPMAX",
+ "Name": "ZPOPMAX",
+ "Arity": -2,
+ "Flags": "Fast, Write",
"FirstKey": 1,
"LastKey": 1,
"Step": 1,
- "AclCategories": "SortedSet, Fast, Read, Garnet",
+ "AclCategories": "Fast, SortedSet, Write",
"KeySpecifications": [
{
"BeginSearch": {
@@ -5560,13 +5692,13 @@
"KeyStep": 1,
"Limit": 0
},
- "Flags": "RO, Access"
+ "Flags": "RW, Access, Delete"
}
]
},
{
- "Command": "ZPOPMAX",
- "Name": "ZPOPMAX",
+ "Command": "ZPOPMIN",
+ "Name": "ZPOPMIN",
"Arity": -2,
"Flags": "Fast, Write",
"FirstKey": 1,
@@ -5590,14 +5722,14 @@
]
},
{
- "Command": "ZPOPMIN",
- "Name": "ZPOPMIN",
- "Arity": -2,
- "Flags": "Fast, Write",
+ "Command": "ZPTTL",
+ "Name": "ZPTTL",
+ "Arity": -5,
+ "Flags": "Fast, ReadOnly",
"FirstKey": 1,
"LastKey": 1,
"Step": 1,
- "AclCategories": "Fast, SortedSet, Write",
+ "AclCategories": "Fast, Read, SortedSet, Garnet",
"KeySpecifications": [
{
"BeginSearch": {
@@ -5610,7 +5742,7 @@
"KeyStep": 1,
"Limit": 0
},
- "Flags": "RW, Access, Delete"
+ "Flags": "RO, Access"
}
]
},
@@ -6009,14 +6141,14 @@
]
},
{
- "Command": "ZTTL",
- "Name": "ZTTL",
- "Arity": -5,
+ "Command": "ZSCORE",
+ "Name": "ZSCORE",
+ "Arity": 3,
"Flags": "Fast, ReadOnly",
"FirstKey": 1,
"LastKey": 1,
"Step": 1,
- "AclCategories": "SortedSet, Fast, Read, Garnet",
+ "AclCategories": "Fast, Read, SortedSet",
"KeySpecifications": [
{
"BeginSearch": {
@@ -6034,14 +6166,14 @@
]
},
{
- "Command": "ZSCORE",
- "Name": "ZSCORE",
- "Arity": 3,
+ "Command": "ZTTL",
+ "Name": "ZTTL",
+ "Arity": -5,
"Flags": "Fast, ReadOnly",
"FirstKey": 1,
"LastKey": 1,
"Step": 1,
- "AclCategories": "Fast, Read, SortedSet",
+ "AclCategories": "Fast, Read, SortedSet, Garnet",
"KeySpecifications": [
{
"BeginSearch": {
diff --git a/libs/server/BTreeIndex/BTree.cs b/libs/server/BTreeIndex/BTree.cs
new file mode 100644
index 00000000000..2f60e392e60
--- /dev/null
+++ b/libs/server/BTreeIndex/BTree.cs
@@ -0,0 +1,191 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System;
+using System.Collections.Generic;
+using System.Runtime.InteropServices;
+
+namespace Garnet.server.BTreeIndex
+{
+ public unsafe partial class BTree
+ {
+ BTreeNode* root;
+ BTreeNode* head;
+ BTreeNode* tail;
+ byte* tailMinKey;
+ public static readonly int MAX_TREE_DEPTH = 10; // maximum allowed depth of the tree
+ static int DEFAULT_SPLIT_LEAF_POSITION = (BTreeNode.LEAF_CAPACITY + 1) / 2; // midpoint at which a leaf node is split
+ static int SPLIT_LEAF_POSITION = BTreeNode.LEAF_CAPACITY; // append-optimized position at which a leaf node is split
+ static int SPLIT_INTERNAL_POSITION = BTreeNode.INTERNAL_CAPACITY; // position at which an internal node is split
+
+ BTreeNode*[] rootToTailLeaf; // array of nodes from root to tail leaf
+ public BTreeStats stats; // statistics about the tree
+
+ /// <summary>
+ /// Initializes a new instance of the <see cref="BTree"/> class.
+ /// </summary>
+ public BTree(uint sectorSize)
+ {
+ var memoryBlock = (IntPtr*)NativeMemory.AlignedAlloc((nuint)BTreeNode.PAGE_SIZE, (nuint)BTreeNode.PAGE_SIZE);
+ root = BTreeNode.Create(BTreeNodeType.Leaf, memoryBlock);
+ head = tail = root;
+ root->info->next = root->info->previous = null;
+ root->info->count = 0;
+ tailMinKey = null;
+ rootToTailLeaf = new BTreeNode*[MAX_TREE_DEPTH];
+ stats = new BTreeStats();
+ stats.depth = 1;
+ stats.numLeafNodes = 1;
+ stats.numAllocates = 1;
+ }
+
+ /// <summary>
+ /// Frees the memory allocated for a node
+ /// </summary>
+ /// <param name="node">BTreeNode to free from memory</param>
+ private void Free(ref BTreeNode* node)
+ {
+ if (node == null)
+ return;
+
+ // If this is an internal node, free all its children first
+ if (node->info->type == BTreeNodeType.Internal)
+ {
+ for (int i = 0; i <= node->info->count; i++)
+ {
+ var child = node->data.children[i];
+ Free(ref child);
+ node->data.children[i] = null;
+ }
+ }
+
+ // Free the memory handle
+ if (node->memoryHandle != null)
+ {
+ NativeMemory.Free(node->memoryHandle);
+ stats.numDeallocates++;
+ node = null;
+ }
+ }
+
+ /// <summary>
+ /// Frees the memory allocated for a node
+ /// </summary>
+ /// <param name="node"></param>
+ public static void FreeNode(ref BTreeNode* node)
+ {
+ if (node == null)
+ return;
+
+ // If this is an internal node, free all its children first
+ if (node->info->type == BTreeNodeType.Internal)
+ {
+ for (int i = 0; i <= node->info->count; i++)
+ {
+ var child = node->data.children[i];
+ FreeNode(ref child);
+ node->data.children[i] = null;
+ }
+ }
+
+ // Free the memory handle
+ if (node->memoryHandle != null)
+ {
+ NativeMemory.Free(node->memoryHandle);
+ node = null;
+ }
+ }
+
+ public static void Deallocate(ref BTreeNode* node)
+ {
+ // Free the memory handle
+ if (node->memoryHandle != null)
+ {
+ NativeMemory.Free(node->memoryHandle);
+ node->info = null;
+ node->keys = null;
+ node->data.values = null;
+ node->data.children = null;
+ node->memoryHandle = null;
+ }
+ }
+
+ /// <summary>
+ /// Deallocates the memory allocated for the B+Tree
+ /// </summary>
+ public void Deallocate()
+ {
+ if (root == null)
+ return;
+ Free(ref root);
+ Console.WriteLine("free complete");
+ stats.printStats();
+ root = null;
+ head = null;
+ tail = null;
+ }
+
+ /// <summary>
+ /// Destructor for the B+tree
+ /// </summary>
+ ~BTree()
+ {
+ Deallocate();
+ }
+
+ public ulong FastInserts => stats.totalFastInserts;
+ public ulong LeafCount => stats.numLeafNodes;
+ public ulong InternalCount => stats.numInternalNodes;
+
+ public ulong ValidCount => StatsValidCount();
+
+ public long RootValidCount => GetValidCount(root);
+
+ public long TailValidCount => GetValidCount(tail);
+
+ public long Count()
+ {
+ return stats.numKeys;
+ }
+ public ulong StatsValidCount()
+ {
+ return stats.numValidKeys;
+ }
+
+ public long GetValidCount(BTreeNode* node)
+ {
+ return node->info->validCount;
+ }
+
+ /// <summary>
+ /// Retrieves the first entry in the B+Tree (smallest key)
+ /// </summary>
+ /// <returns>entry fetched</returns>
+ public KeyValuePair<byte[], Value> First()
+ {
+ BTreeNode* leaf = head;
+ if (leaf == null)
+ {
+ return default;
+ }
+ byte[] keyBytes = new ReadOnlySpan<byte>(leaf->GetKey(0), BTreeNode.KEY_SIZE).ToArray();
+ return new KeyValuePair<byte[], Value>(keyBytes, leaf->GetValue(0));
+ }
+
+ /// <summary>
+ /// Retrieves the last entry in the B+Tree (largest key)
+ /// </summary>
+ /// <returns>entry fetched</returns>
+ public KeyValuePair<byte[], Value> Last()
+ {
+ BTreeNode* leaf = tail;
+ if (leaf == null)
+ {
+ return default;
+ }
+ byte[] keyBytes = new ReadOnlySpan<byte>(leaf->GetKey(leaf->info->count - 1), BTreeNode.KEY_SIZE).ToArray();
+ return new KeyValuePair<byte[], Value>(keyBytes, leaf->GetValue(leaf->info->count - 1));
+ }
+
+ }
+}
\ No newline at end of file
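
Taken together, this partial class exposes construction, fast-path insertion, and ordered First/Last access. A hypothetical usage sketch (compiles only inside Garnet.server with unsafe code enabled; keys must arrive in increasing order because Insert appends to the tail leaf):

```csharp
using System;
using Garnet.server.BTreeIndex;

public static unsafe class BTreeSketch
{
    public static void Demo()
    {
        var tree = new BTree(4096); // sectorSize is currently unused by the constructor
        byte* key = stackalloc byte[BTreeNode.KEY_SIZE];
        new Span<byte>(key, BTreeNode.KEY_SIZE).Clear();
        for (byte i = 1; i <= 3; i++)
        {
            key[BTreeNode.KEY_SIZE - 1] = i; // monotonically increasing keys
            tree.Insert(key, new Value(i));  // fast path: append to the tail leaf
        }
        Console.WriteLine($"{tree.First().Value.address}..{tree.Last().Value.address}"); // 1..3
    }
}
```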
diff --git a/libs/server/BTreeIndex/BTreeDelete.cs b/libs/server/BTreeIndex/BTreeDelete.cs
new file mode 100644
index 00000000000..07097b04831
--- /dev/null
+++ b/libs/server/BTreeIndex/BTreeDelete.cs
@@ -0,0 +1,32 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+namespace Garnet.server.BTreeIndex
+{
+ public unsafe partial class BTree
+ {
+ /// <summary>
+ /// Delete a key from the B+tree
+ /// </summary>
+ /// <param name="key">key to delete</param>
+ /// <returns>true if key was tombstoned</returns>
+ public bool Delete(byte* key)
+ {
+ BTreeNode* leaf = null;
+ var nodesTraversed = new BTreeNode*[MAX_TREE_DEPTH];
+
+ TraverseToLeaf(ref leaf, ref nodesTraversed, key);
+ var index = leaf->LowerBound(key);
+ if (index >= leaf->info->count || BTreeNode.Compare(key, leaf->GetKey(index)) != 0)
+ {
+ return false;
+ }
+
+ // insert a tombstone for the delete
+ leaf->InsertTombstone(index);
+ leaf->info->validCount--;
+ stats.numValidKeys--;
+ return true;
+ }
+ }
+}
\ No newline at end of file
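
Deletion is logical: the slot is tombstoned and validCount decremented, while count and the physical layout are untouched. A small sketch of the observable effect, under the same assumptions as the BTree sketch above:

```csharp
using System;
using Garnet.server.BTreeIndex;

public static unsafe class DeleteSketch
{
    public static void Demo()
    {
        var tree = new BTree(4096);
        byte* k = stackalloc byte[BTreeNode.KEY_SIZE];
        new Span<byte>(k, BTreeNode.KEY_SIZE).Clear();
        k[BTreeNode.KEY_SIZE - 1] = 42;
        tree.Insert(k, new Value(7));
        tree.Delete(k);                       // tombstones the slot in place
        Console.WriteLine(tree.Get(k).Valid); // False: lookups skip tombstones
    }
}
```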
diff --git a/libs/server/BTreeIndex/BTreeInsert.cs b/libs/server/BTreeIndex/BTreeInsert.cs
new file mode 100644
index 00000000000..d9073dbd930
--- /dev/null
+++ b/libs/server/BTreeIndex/BTreeInsert.cs
@@ -0,0 +1,344 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System;
+using System.Runtime.InteropServices;
+
+namespace Garnet.server.BTreeIndex
+{
+ public unsafe partial class BTree
+ {
+ /// <summary>
+ /// Insert a key-value pair into the B+tree. Directly inserts into the tail leaf node.
+ /// </summary>
+ /// <param name="key"></param>
+ /// <param name="value"></param>
+ /// <returns>true if insertion is successful</returns>
+ public bool Insert(byte* key, Value value)
+ {
+ BTreeNode* leaf = null;
+ stats.totalFastInserts++;
+ stats.totalInserts++;
+ stats.numKeys++;
+ stats.numValidKeys++;
+ leaf = tail;
+ return InsertToLeafNode(ref leaf, ref rootToTailLeaf, key, value, true);
+ }
+
+ public bool Insert(byte* key, ReadOnlySpan<byte> keySpan, Value value)
+ {
+ BTreeNode* leaf = null;
+ stats.totalFastInserts++;
+ stats.totalInserts++;
+ stats.numKeys++;
+ stats.numValidKeys++;
+ leaf = tail;
+ return InsertToLeafNode(ref leaf, ref rootToTailLeaf, key, value, true);
+ }
+ public bool InsertToLeafNode(ref BTreeNode* leaf, ref BTreeNode*[] nodesTraversed, byte* key, Value value, bool appendToLeaf = false)
+ {
+ int index;
+ if (appendToLeaf)
+ {
+ // if leaf has space
+ if (leaf->info->count < BTreeNode.LEAF_CAPACITY)
+ {
+ // append to end of leaf node
+ leaf->SetKey(leaf->info->count, key);
+ leaf->SetValue(leaf->info->count, value);
+ leaf->info->count++;
+ leaf->info->validCount++;
+ return true;
+ }
+ index = leaf->info->count;
+ return SplitLeafNode(ref leaf, ref nodesTraversed, key, value, index);
+ }
+
+ // find the index where the key should be inserted
+ index = leaf->LowerBound(key);
+ if (index < leaf->info->count && BTreeNode.Compare(key, leaf->GetKey(index)) == 0)
+ {
+ // insert is actually an update
+ leaf->SetValue(index, value);
+ return false;
+ }
+
+ if (leaf->info->count < BTreeNode.LEAF_CAPACITY)
+ {
+ // move keys to the right of index
+ var sourceSpan = new ReadOnlySpan<byte>(leaf->keys + index * BTreeNode.KEY_SIZE, (leaf->info->count - index) * BTreeNode.KEY_SIZE);
+ var destinationSpan = new Span<byte>(leaf->keys + ((index + 1) * BTreeNode.KEY_SIZE), (leaf->info->count - index) * BTreeNode.KEY_SIZE);
+ sourceSpan.CopyTo(destinationSpan);
+
+ leaf->SetKey(index, key);
+ leaf->SetValue(index, value);
+ leaf->info->count++;
+ leaf->info->validCount++;
+ return true;
+ }
+ return SplitLeafNode(ref leaf, ref nodesTraversed, key, value, index);
+ }
+
+ public bool SplitLeafNode(ref BTreeNode* leaf, ref BTreeNode*[] nodesTraversed, byte* key, Value value, int index)
+ {
+ var memoryBlock = (IntPtr*)NativeMemory.AlignedAlloc((nuint)BTreeNode.PAGE_SIZE, (nuint)BTreeNode.PAGE_SIZE);
+ stats.numAllocates++;
+ BTreeNode* newLeaf = BTreeNode.Create(BTreeNodeType.Leaf, memoryBlock);
+
+ leaf->info->count = SPLIT_LEAF_POSITION;
+ newLeaf->info->previous = leaf;
+ newLeaf->info->next = leaf->info->next;
+ newLeaf->info->count = BTreeNode.LEAF_CAPACITY + 1 - SPLIT_LEAF_POSITION;
+ leaf->info->next = newLeaf;
+ stats.numLeafNodes++;
+
+ // scan the keys from splitLeafPos to get the number of valid keys in the new leaf
+ uint newLeafValidCount = 0;
+ for (var i = SPLIT_LEAF_POSITION; i < BTreeNode.LEAF_CAPACITY; i++)
+ {
+ if (leaf->data.values[i].Valid)
+ {
+ newLeafValidCount++;
+ }
+ }
+ leaf->info->validCount -= newLeafValidCount;
+ newLeaf->info->validCount = newLeafValidCount;
+ // insert the new key to either the old node or the newly created node, based on the index
+ if (index >= leaf->info->count)
+ {
+ // new key goes to the new leaf
+ var newIndex = index - leaf->info->count;
+
+ // move the keys from old node to the new node using ReadOnlySpan
+ var sourceSpan = new ReadOnlySpan<byte>(leaf->keys + leaf->info->count * BTreeNode.KEY_SIZE, newIndex * BTreeNode.KEY_SIZE);
+ var destinationSpan = new Span<byte>(newLeaf->keys, newIndex * BTreeNode.KEY_SIZE);
+ sourceSpan.CopyTo(destinationSpan);
+
+ // add key to new leaf
+ newLeaf->SetKey(newIndex, key);
+
+ var existingLeafKeysSpan = new ReadOnlySpan<byte>(leaf->keys + index * BTreeNode.KEY_SIZE, (BTreeNode.LEAF_CAPACITY - index) * BTreeNode.KEY_SIZE);
+ var newLeafKeysSpan = new Span<byte>(newLeaf->keys + (newIndex + 1) * BTreeNode.KEY_SIZE, (BTreeNode.LEAF_CAPACITY - index) * BTreeNode.KEY_SIZE);
+ existingLeafKeysSpan.CopyTo(newLeafKeysSpan);
+
+ var existingLeafValuesSpan = new ReadOnlySpan<byte>(leaf->data.values + leaf->info->count, newIndex * sizeof(Value));
+ var newLeafValuesSpan = new Span<byte>(newLeaf->data.values, newIndex * sizeof(Value));
+ existingLeafValuesSpan.CopyTo(newLeafValuesSpan);
+ newLeaf->SetValue(newIndex, value);
+
+ var existingLeafValuesSpan2 = new ReadOnlySpan<byte>(leaf->data.values + index, (BTreeNode.LEAF_CAPACITY - index) * sizeof(Value));
+ var newLeafValuesSpan2 = new Span<byte>(newLeaf->data.values + newIndex + 1, (BTreeNode.LEAF_CAPACITY - index) * sizeof(Value));
+ existingLeafValuesSpan2.CopyTo(newLeafValuesSpan2);
+ newLeaf->info->validCount++;
+ }
+ else
+ {
+ var existingLeafKeysSpan = new ReadOnlySpan<byte>(leaf->keys + (leaf->info->count - 1) * BTreeNode.KEY_SIZE, newLeaf->info->count * BTreeNode.KEY_SIZE);
+ var newLeafKeysSpan = new Span<byte>(newLeaf->keys, newLeaf->info->count * BTreeNode.KEY_SIZE);
+ existingLeafKeysSpan.CopyTo(newLeafKeysSpan);
+
+ var existingLeafKeysSpan2 = new ReadOnlySpan<byte>(leaf->keys + index * BTreeNode.KEY_SIZE, (leaf->info->count - index - 1) * BTreeNode.KEY_SIZE);
+ var newLeafKeysSpan2 = new Span<byte>(leaf->keys + ((index + 1) * BTreeNode.KEY_SIZE), (leaf->info->count - index - 1) * BTreeNode.KEY_SIZE);
+ existingLeafKeysSpan2.CopyTo(newLeafKeysSpan2);
+ leaf->SetKey(index, key);
+
+ var existingLeafValuesSpan = new ReadOnlySpan<byte>(leaf->data.values + leaf->info->count - 1, newLeaf->info->count * sizeof(Value));
+ var newLeafValuesSpan = new Span<byte>(newLeaf->data.values, newLeaf->info->count * sizeof(Value));
+ existingLeafValuesSpan.CopyTo(newLeafValuesSpan);
+
+ var existingLeafValuesSpan2 = new ReadOnlySpan<byte>(leaf->data.values + index, (leaf->info->count - index - 1) * sizeof(Value));
+ var newLeafValuesSpan2 = new Span<byte>(leaf->data.values + index + 1, (leaf->info->count - index - 1) * sizeof(Value));
+ existingLeafValuesSpan2.CopyTo(newLeafValuesSpan2);
+ leaf->SetValue(index, value);
+ leaf->info->validCount++;
+ }
+
+ uint validCount = 0;
+ // the leaf that is split will also be the tail node; so update the tail pointer
+ if (leaf == tail)
+ {
+ tail = newLeaf;
+ tailMinKey = newLeaf->GetKey(0);
+ rootToTailLeaf[0] = newLeaf;
+ // validCount in internal nodes of the index excludes the validCount of the tail leaf node (optimizing for performance to avoid traversal)
+ // thus, when we split the tail leaf, we push up the validCount of the leaf that we split to the internal node
+ validCount = leaf->info->validCount;
+ }
+
+ // update the parent node with the new key
+ PushUpKeyInInternalNode(ref nodesTraversed, newLeaf->GetKey(0), ref newLeaf, SPLIT_INTERNAL_POSITION, validCount);
+ return true;
+ }
+
+ public void PushUpKeyInInternalNode(ref BTreeNode*[] nodesTraversed, byte* key, ref BTreeNode* child, int splitPos, uint newValidCount)
+ {
+ int i;
+ // starts from parent of leaf node that triggered the push-up.
+ // if the parent has space, insert the key and child pointer, and return. Otherwise, split and cascade up.
+ for (i = 1; i < stats.depth; i++)
+ {
+ var node = nodesTraversed[i];
+ var index = node->UpperBound(key);
+
+ if (node->info->count < BTreeNode.INTERNAL_CAPACITY)
+ {
+ // we can insert
+ InsertToInternalNodeWithinCapacity(ref node, key, ref child, ref nodesTraversed, index, newValidCount);
+
+ // update validCounts in the parent nodes
+ for (var j = i + 1; j < stats.depth; j++)
+ {
+ nodesTraversed[j]->info->validCount += newValidCount;
+ }
+ return;
+ }
+
+ // split internal node
+ node->info->validCount += newValidCount;
+ var newNode = SplitInternalNode(ref node, ref nodesTraversed, ref key, ref child, splitPos, index, i);
+ if (rootToTailLeaf[i] == node && tail != head && BTreeNode.Compare(key, tailMinKey) <= 0)
+ {
+ rootToTailLeaf[i] = newNode;
+ }
+ child = newNode;
+ }
+ // split root
+ CreateNewRoot(key, child);
+ }
+
+ public void InsertToInternalNodeWithinCapacity(ref BTreeNode* node, byte* key, ref BTreeNode* child, ref BTreeNode*[] nodesTraversed, int index, uint newValidCount)
+ {
+ // move all keys to the right
+ var sourceSpan = new ReadOnlySpan<byte>(node->keys + index * BTreeNode.KEY_SIZE, (node->info->count - index) * BTreeNode.KEY_SIZE);
+ var destinationSpan = new Span<byte>(node->keys + ((index + 1) * BTreeNode.KEY_SIZE), (node->info->count - index) * BTreeNode.KEY_SIZE);
+ sourceSpan.CopyTo(destinationSpan);
+
+ // move all children starting from index+1 to the right using a for loop
+ for (var j = node->info->count; j > index; j--)
+ {
+ node->SetChild(j + 1, node->GetChild(j));
+ }
+
+ // insert
+ node->SetKey(index, key);
+ node->SetChild(index + 1, child);
+ node->info->count++;
+ node->info->validCount += newValidCount;
+ }
+
+ public BTreeNode* CreateInternalNode(ref BTreeNode* node, int splitPos)
+ {
+ var memoryBlock = (IntPtr*)NativeMemory.AlignedAlloc((nuint)BTreeNode.PAGE_SIZE, (nuint)BTreeNode.PAGE_SIZE);
+ stats.numAllocates++;
+ BTreeNode* newNode = BTreeNode.Create(BTreeNodeType.Internal, memoryBlock);
+ stats.numInternalNodes++;
+ node->info->count = splitPos;
+ newNode->info->count = BTreeNode.INTERNAL_CAPACITY - splitPos;
+ newNode->info->next = node->info->next;
+ newNode->info->previous = node;
+ node->info->next = newNode;
+ return newNode;
+ }
+
+ public BTreeNode* SplitInternalNode(ref BTreeNode* nodeToSplit, ref BTreeNode*[] nodesTraversed, ref byte* key, ref BTreeNode* child, int splitPos, int index, int level)
+ {
+ var newNode = CreateInternalNode(ref nodeToSplit, splitPos);
+
+ // scan keys from splitPos to get number of valid keys in the new node
+ uint newValidCount = 0;
+ for (int i = splitPos; i < BTreeNode.INTERNAL_CAPACITY; i++)
+ {
+ if (nodeToSplit->GetChild(i) != null)
+ {
+ newValidCount += nodeToSplit->GetChild(i)->info->validCount;
+ }
+ }
+ newNode->info->validCount = newValidCount;
+
+ if (index > nodeToSplit->info->count)
+ {
+ // child goes to newNode
+ var sourceSpan = new ReadOnlySpan<byte>(nodeToSplit->keys + (nodeToSplit->info->count + 1) * BTreeNode.KEY_SIZE, (index - nodeToSplit->info->count - 1) * BTreeNode.KEY_SIZE);
+ var destinationSpan = new Span<byte>(newNode->keys, (index - nodeToSplit->info->count - 1) * BTreeNode.KEY_SIZE);
+ sourceSpan.CopyTo(destinationSpan);
+
+ var existingNodeKeysSpan = new ReadOnlySpan<byte>(nodeToSplit->keys + index * BTreeNode.KEY_SIZE, (BTreeNode.INTERNAL_CAPACITY - index) * BTreeNode.KEY_SIZE);
+ var newNodeKeysSpan = new Span<byte>(newNode->keys + (index - nodeToSplit->info->count) * BTreeNode.KEY_SIZE, (BTreeNode.INTERNAL_CAPACITY - index) * BTreeNode.KEY_SIZE);
+ existingNodeKeysSpan.CopyTo(newNodeKeysSpan);
+ newNode->SetKey(index - nodeToSplit->info->count - 1, key);
+
+ var existingNodeChildrenSpan = new ReadOnlySpan<byte>(nodeToSplit->data.children + 1 + nodeToSplit->info->count, (index - nodeToSplit->info->count) * sizeof(BTreeNode*));
+ var newNodeChildrenSpan = new Span<byte>(newNode->data.children, (index - nodeToSplit->info->count) * sizeof(BTreeNode*));
+ existingNodeChildrenSpan.CopyTo(newNodeChildrenSpan);
+
+ var existingNodeChildrenSpan2 = new ReadOnlySpan<byte>(nodeToSplit->data.children + 1 + index, newNode->info->count * sizeof(BTreeNode*));
+ var newNodeChildrenSpan2 = new Span<byte>(newNode->data.children + 1 + index - nodeToSplit->info->count, newNode->info->count * sizeof(BTreeNode*));
+ existingNodeChildrenSpan2.CopyTo(newNodeChildrenSpan2);
+ newNode->SetChild(index - nodeToSplit->info->count, child);
+ key = nodeToSplit->GetKey(nodeToSplit->info->count);
+ }
+ else if (index == nodeToSplit->info->count)
+ {
+ var sourceSpan = new ReadOnlySpan<byte>(nodeToSplit->keys + nodeToSplit->info->count * BTreeNode.KEY_SIZE, newNode->info->count * BTreeNode.KEY_SIZE);
+ var destinationSpan = new Span<byte>(newNode->keys, newNode->info->count * BTreeNode.KEY_SIZE);
+ sourceSpan.CopyTo(destinationSpan);
+
+ var existingNodeChildrenSpan = new ReadOnlySpan<byte>(nodeToSplit->data.children + 1 + nodeToSplit->info->count, newNode->info->count * sizeof(BTreeNode*));
+ var newNodeChildrenSpan = new Span<byte>(newNode->data.children + 1, newNode->info->count * sizeof(BTreeNode*));
+ existingNodeChildrenSpan.CopyTo(newNodeChildrenSpan);
+ newNode->SetChild(0, child);
+ }
+ else
+ {
+ // child goes to old node
+ var sourceSpan = new ReadOnlySpan<byte>(nodeToSplit->keys + nodeToSplit->info->count * BTreeNode.KEY_SIZE, newNode->info->count * BTreeNode.KEY_SIZE);
+ var destinationSpan = new Span<byte>(newNode->keys, newNode->info->count * BTreeNode.KEY_SIZE);
+ sourceSpan.CopyTo(destinationSpan);
+
+ var existingNodeKeysSpan = new ReadOnlySpan<byte>(nodeToSplit->keys + index * BTreeNode.KEY_SIZE, (nodeToSplit->info->count - index) * BTreeNode.KEY_SIZE);
+ var newNodeKeysSpan = new Span<byte>(nodeToSplit->keys + ((index + 1) * BTreeNode.KEY_SIZE), (nodeToSplit->info->count - index) * BTreeNode.KEY_SIZE);
+ existingNodeKeysSpan.CopyTo(newNodeKeysSpan);
+ nodeToSplit->SetKey(index, key);
+
+ var existingNodeChildrenSpan = new ReadOnlySpan<byte>(nodeToSplit->data.children + nodeToSplit->info->count, newNode->info->count * sizeof(BTreeNode*));
+ var newNodeChildrenSpan = new Span<byte>(newNode->data.children, newNode->info->count * sizeof(BTreeNode*));
+ existingNodeChildrenSpan.CopyTo(newNodeChildrenSpan);
+
+ var existingNodeChildrenSpan2 = new ReadOnlySpan<byte>(nodeToSplit->data.children + index + 1, (nodeToSplit->info->count - index + 1) * sizeof(BTreeNode*));
+ var newNodeChildrenSpan2 = new Span<byte>(nodeToSplit->data.children + index + 2, (nodeToSplit->info->count - index + 1) * sizeof(BTreeNode*));
+ existingNodeChildrenSpan2.CopyTo(newNodeChildrenSpan2);
+ nodeToSplit->SetChild(index + 1, child);
+ key = nodeToSplit->GetKey(nodeToSplit->info->count);
+ }
+
+ return newNode;
+ }
+
+
+ public void CreateNewRoot(byte* key, BTreeNode* newlySplitNode)
+ {
+ var memoryBlock = (IntPtr*)NativeMemory.AlignedAlloc((nuint)BTreeNode.PAGE_SIZE, (nuint)BTreeNode.PAGE_SIZE);
+ stats.numAllocates++;
+ BTreeNode* newRoot = BTreeNode.Create(BTreeNodeType.Internal, memoryBlock);
+
+ // Set the new root's key.
+ newRoot->info->count = 1;
+ newRoot->SetKey(0, key);
+
+ // Set children: left child is the old root; right child is the newly split node.
+ newRoot->SetChild(0, root);
+ newRoot->SetChild(1, newlySplitNode);
+
+ newRoot->info->validCount = root->info->validCount;
+ if (newlySplitNode != tail)
+ {
+ newRoot->info->validCount += newlySplitNode->info->validCount;
+ }
+ newRoot->info->next = newRoot->info->previous = null;
+ root = newRoot;
+ rootToTailLeaf[stats.depth] = newRoot;
+ stats.depth++;
+ stats.numInternalNodes++;
+ }
+ }
+}
\ No newline at end of file
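
Note that SPLIT_LEAF_POSITION is set to LEAF_CAPACITY rather than the midpoint DEFAULT_SPLIT_LEAF_POSITION, so a split triggered by a tail append keeps the old leaf full and seeds the new tail leaf with a single key; that is what makes monotonic stream-ID appends cheap. The arithmetic, spelled out with the leaf capacity assumed from BTreeInternals:

```csharp
using System;

class SplitSketch
{
    static void Main()
    {
        const int LEAF_CAPACITY = 161;                  // assumed; see BTreeInternals
        const int SPLIT_LEAF_POSITION = LEAF_CAPACITY;  // tail-append split point
        int oldLeafCount = SPLIT_LEAF_POSITION;                     // stays full: 161
        int newLeafCount = LEAF_CAPACITY + 1 - SPLIT_LEAF_POSITION; // new tail: 1
        Console.WriteLine($"{oldLeafCount} / {newLeafCount}");
    }
}
```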
diff --git a/libs/server/BTreeIndex/BTreeInternals.cs b/libs/server/BTreeIndex/BTreeInternals.cs
new file mode 100644
index 00000000000..2e55cd968d4
--- /dev/null
+++ b/libs/server/BTreeIndex/BTreeInternals.cs
@@ -0,0 +1,334 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System;
+using System.Numerics;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics.X86;
+
+namespace Garnet.server.BTreeIndex
+{
+
+ public enum BTreeNodeType
+ {
+ Internal,
+ Leaf
+ }
+
+ /// <summary>
+ /// Represents information stored in a node in the B+tree
+ /// </summary>
+ [StructLayout(LayoutKind.Explicit)]
+ public unsafe struct NodeData
+ {
+ [FieldOffset(0)]
+ public Value* values;
+ [FieldOffset(0)]
+ public BTreeNode** children;
+ }
+
+ [StructLayout(LayoutKind.Explicit, Size = sizeof(byte) + sizeof(ulong))]
+ public struct Value
+ {
+ [FieldOffset(0)]
+ public byte valid;
+ [FieldOffset(1)]
+ public ulong address;
+
+ public bool Valid
+ {
+ get
+ {
+ return valid == 1;
+ }
+ set
+ {
+ valid = (byte)(value ? 1 : 0);
+ }
+ }
+
+ public Value(ulong value)
+ {
+ this.valid = 1;
+ this.address = value;
+ }
+ }
+
+ public unsafe struct NodeInfo
+ {
+ public BTreeNodeType type;
+ public int count;
+ public BTreeNode* next;
+ public BTreeNode* previous;
+ public uint validCount; // valid keys (non-tombstone keys) in the node.
+ }
+
+ /// <summary>
+ /// Represents a node in the B+tree
+ /// Memory layout:
+ /// +-----------------------------------+
+ /// | BTreeNode (HEADER_SIZE bytes) |
+ /// | - NodeInfo* info |
+ /// | - NodeData data |
+ /// | - byte* keys |
+ /// | - IntPtr* memoryHandle |
+ /// +-----------------------------------+
+ /// | NodeInfo (METADATA_SIZE bytes) |
+ /// | - BTreeNodeType type |
+ /// | - int count |
+ /// | - BTreeNode* next |
+ /// | - BTreeNode* previous |
+ /// | - uint validCount |
+ /// +-----------------------------------+
+ /// | Keys array: capacity * KEY_SIZE |
+ /// +-----------------------------------+
+ /// | Data array: either Value[] (leaf) |
+ /// | or BTreeNode*[] (internal) |
+ /// +-----------------------------------+
+ /// Expects an allocated block of memory (of size BTreeNode.PAGE_SIZE) to be passed as handle
+ /// Stores handle for deallocation
+ /// BTreeNode struct also contained within the 4KB block to allow pointers to created nodes to be passed around
+ /// as well as allow for on-demand allocation/deallocation.
+ /// NOTE: currently reverted to MemoryMarshal for allocation of handles due to undefined behavior with SectorAlignedMemory.
+ /// </summary>
+ public unsafe struct BTreeNode
+ {
+ public static int HEADER_SIZE = sizeof(BTreeNode);
+ public static int PAGE_SIZE = 4096; // This must be increased if you want to store the BTreeNode header in the block.
+ public static int KEY_SIZE = 16; // key size in bytes.
+ public static int METADATA_SIZE = sizeof(NodeInfo);
+ public static int LEAF_CAPACITY = (PAGE_SIZE - HEADER_SIZE - METADATA_SIZE) / (KEY_SIZE + sizeof(Value));
+ public static int INTERNAL_CAPACITY = (PAGE_SIZE - HEADER_SIZE - METADATA_SIZE - sizeof(BTreeNode*)) / (KEY_SIZE + sizeof(BTreeNode*));
+
+ public NodeInfo* info;
+ public NodeData data;
+ public byte* keys;
+ public IntPtr* memoryHandle;
+
+ public static BTreeNode* Create(BTreeNodeType type, IntPtr* handle)
+ {
+ // Place the node header at the beginning of the block.
+ BTreeNode* node = (BTreeNode*)handle;
+ node->memoryHandle = handle;
+
+ // Define the start of the payload right after the header.
+ byte* payloadPtr = (byte*)(handle) + HEADER_SIZE;
+
+ // The NodeInfo will be stored at the start of the payload.
+ node->info = (NodeInfo*)payloadPtr;
+ node->info->type = type;
+ node->info->count = 0;
+ node->info->next = null;
+ node->info->previous = null;
+ node->info->validCount = 0;
+
+ // Data for keys follows the NodeInfo.
+ byte* keysPtr = payloadPtr + METADATA_SIZE;
+ node->keys = keysPtr;
+
+ int capacity = (type == BTreeNodeType.Leaf) ? LEAF_CAPACITY : INTERNAL_CAPACITY;
+ int keysSize = capacity * KEY_SIZE;
+ byte* dataSectionPtr = keysPtr + keysSize;
+
+ // Set up NodeData in-place.
+ if (type == BTreeNodeType.Leaf)
+ {
+ node->data.values = (Value*)dataSectionPtr;
+ }
+ else
+ {
+ node->data.children = (BTreeNode**)dataSectionPtr;
+ }
+
+ return node;
+ }
+
+ public byte* GetKey(int index)
+ {
+ byte* keyAddress = keys + (index * KEY_SIZE);
+ return keyAddress;
+ }
+
+ public void SetKey(int index, byte* keyData)
+ {
+ var sourceSpan = new ReadOnlySpan<byte>(keyData, KEY_SIZE);
+ var destinationSpan = new Span<byte>(keys + (index * KEY_SIZE), KEY_SIZE);
+ sourceSpan.CopyTo(destinationSpan);
+ }
+
+ public void SetChild(int index, BTreeNode* child)
+ {
+ data.children[index] = child;
+ }
+
+ public BTreeNode* GetChild(int index)
+ {
+ return data.children[index];
+ }
+
+ public void SetValue(int index, Value value)
+ {
+ data.values[index] = value;
+ }
+
+ public Value GetValue(int index)
+ {
+ return data.values[index];
+ }
+
+ public void SetValueValid(int index, bool valid)
+ {
+ data.values[index].Valid = valid;
+ }
+
+ public bool IsValueValid(int index)
+ {
+ return data.values[index].Valid;
+ }
+
+ public void InsertTombstone(int index)
+ {
+ data.values[index].Valid = false;
+ }
+
+ /// <summary>
+ /// Returns the index of the first key greater than the given key
+ /// </summary>
+ /// <param name="key"></param>
+ /// <returns></returns>
+ public int UpperBound(byte* key)
+ {
+ if (info->count == 0)
+ {
+ return 0;
+ }
+ int left = 0, right = info->count - 1;
+ while (left <= right)
+ {
+ var mid = left + (right - left) / 2;
+ byte* midKey = GetKey(mid);
+ int cmp = Compare(key, midKey);
+ if (cmp < 0)
+ {
+ right = mid - 1;
+ }
+ else
+ {
+ left = mid + 1;
+ }
+ }
+ return left;
+ }
+
+ /// <summary>
+ /// Returns the index of the first key greater than or equal to the given key
+ /// </summary>
+ /// <param name="key"></param>
+ /// <returns></returns>
+ public int LowerBound(byte* key)
+ {
+ if (info->count == 0)
+ {
+ return 0;
+ }
+ int left = 0, right = info->count - 1;
+ while (left <= right)
+ {
+ var mid = left + (right - left) / 2;
+ byte* midKey = GetKey(mid);
+ int cmp = Compare(midKey, key);
+ if (cmp == 0)
+ {
+ return mid;
+ }
+ else if (cmp < 0)
+ {
+ left = mid + 1;
+ }
+ else
+ {
+ right = mid - 1;
+ }
+ }
+ return left;
+ }
+
+ /// <summary>
+ /// Compares two keys
+ /// </summary>
+ /// <param name="key1"></param>
+ /// <param name="key2"></param>
+ /// <returns>-1 if key1 is less than key2; 0 if key1 == key2; 1 if key1 > key2</returns>
+ public static int Compare(byte* key1, byte* key2)
+ {
+
+ if (Sse2.IsSupported)
+ {
+ var v1 = Sse2.LoadVector128(key1);
+ var v2 = Sse2.LoadVector128(key2);
+
+ var mask = Sse2.MoveMask(Sse2.CompareEqual(v1, v2));
+
+ if (mask != 0xFFFF) // Not all bytes are equal
+ {
+ // Find the index of the first differing byte
+ int index = BitOperations.TrailingZeroCount(~mask); // Invert mask to find first zero (differing byte)
+ return key1[index] < key2[index] ? -1 : 1;
+ }
+
+ return 0; // Arrays are equal
+ }
+ else
+ {
+ return new Span<byte>(key1, KEY_SIZE).SequenceCompareTo(new Span<byte>(key2, KEY_SIZE));
+ }
+ }
+ }
+
+ /// <summary>
+ /// Statistics about the B+Tree
+ /// </summary>
+ public struct BTreeStats
+ {
+ // general index stats
+ public int depth;
+ public ulong numLeafNodes;
+ public ulong numInternalNodes;
+
+ // workload specific stats
+ public long totalInserts; // cumulative number of inserts to the index
+ public long totalDeletes; // cumulative number of deletes to the index
+ public ulong totalFastInserts; // cumulative number of fast inserts to the index
+ public long numKeys; // number of keys currently indexed
+ public ulong numValidKeys; // number of keys that are not tombstoned
+ public ulong numAllocates;
+ public ulong numDeallocates;
+ public BTreeStats()
+ {
+ depth = 0;
+ numLeafNodes = 0;
+ numInternalNodes = 0;
+ totalInserts = 0;
+ totalDeletes = 0;
+ totalFastInserts = 0;
+ numKeys = 0;
+ numValidKeys = 0;
+ numAllocates = 0;
+ numDeallocates = 0;
+ }
+
+ public void printStats()
+ {
+ Console.WriteLine($"Depth: {depth}");
+ Console.WriteLine($"Number of leaf nodes: {numLeafNodes}");
+ Console.WriteLine($"Number of internal nodes: {numInternalNodes}");
+ Console.WriteLine($"Total inserts: {totalInserts}");
+ Console.WriteLine($"Total deletes: {totalDeletes}");
+ Console.WriteLine($"Total fast inserts: {totalFastInserts}");
+ Console.WriteLine($"Number of keys: {numKeys}");
+ Console.WriteLine($"Number of valid keys: {numValidKeys}");
+ Console.WriteLine($"Number of allocates: {numAllocates}");
+ Console.WriteLine($"Number of deallocates: {numDeallocates}");
+ }
+ }
+}
\ No newline at end of file
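
The capacity constants fall out of the 4 KB page budget. A back-of-the-envelope check, assuming a 64-bit process (BTreeNode header = four 8-byte fields, NodeInfo padded to 32 bytes, Value = 9 bytes under its explicit layout):

```csharp
using System;

class CapacitySketch
{
    static void Main()
    {
        const int PAGE = 4096, HEADER = 32, META = 32, KEY = 16;
        const int VALUE = 9, PTR = 8; // sizeof(Value), sizeof(BTreeNode*)
        Console.WriteLine((PAGE - HEADER - META) / (KEY + VALUE));     // leaf: 161
        Console.WriteLine((PAGE - HEADER - META - PTR) / (KEY + PTR)); // internal: 167
    }
}
```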
diff --git a/libs/server/BTreeIndex/BTreeLookup.cs b/libs/server/BTreeIndex/BTreeLookup.cs
new file mode 100644
index 00000000000..bf73a5bbbae
--- /dev/null
+++ b/libs/server/BTreeIndex/BTreeLookup.cs
@@ -0,0 +1,156 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System.Collections.Generic;
+using System.Diagnostics;
+
+namespace Garnet.server.BTreeIndex
+{
+ public unsafe partial class BTree
+ {
+ /// <summary>
+ /// Point lookup in the index
+ /// </summary>
+ /// <param name="key">lookup key</param>
+ /// <returns></returns>
+ public Value Get(byte* key)
+ {
+ BTreeNode* leaf = null;
+ var nodesTraversed = new BTreeNode*[MAX_TREE_DEPTH];
+ TraverseToLeaf(ref leaf, ref nodesTraversed, key);
+
+ var index = leaf->LowerBound(key);
+ if (index < leaf->info->count && BTreeNode.Compare(key, leaf->GetKey(index)) == 0)
+ {
+ var value = leaf->GetValue(index);
+ if (value.Valid)
+ {
+ return value;
+ }
+ }
+ return default;
+ }
+
+ /// <summary>
+ /// Range lookup in the index
+ /// </summary>
+ /// <param name="start">start key for the range lookup</param>
+ /// <param name="end">end key for the range lookup</param>
+ /// <param name="startVal">address of the start key</param>
+ /// <param name="endVal">address of end key</param>
+ /// <param name="tombstones">list of tombstones</param>
+ /// <param name="limit">limit entries scanned in the range lookup</param>
+ /// <param name="reverse">reverse lookup</param>
+ /// <returns></returns>
+ public int Get(byte* start, byte* end, out Value startVal, out Value endVal, out List<Value> tombstones, long limit = -1, bool reverse = false)
+ {
+ Debug.Assert(reverse ?
+ BTreeNode.Compare(start, end) >= 0 : BTreeNode.Compare(start, end) <= 0,
+ "Start key should be less than or equal to end key");
+ int count = 0;
+ tombstones = new List<Value>();
+ BTreeNode* startLeaf = null, endLeaf = null;
+ BTreeNode*[] nodesTraversed = new BTreeNode*[MAX_TREE_DEPTH];
+ int startIndex, endIndex;
+
+ // find the leaf node for the start key
+ TraverseToLeaf(ref startLeaf, ref nodesTraversed, start);
+ // find the leaf node for the end key
+ TraverseToLeaf(ref endLeaf, ref nodesTraversed, end);
+
+ if (reverse)
+ {
+ // find the first slot > start and subtract one index to get the start index
+ startIndex = startLeaf->UpperBound(start) - 1;
+ startVal = startLeaf->GetValue(startIndex);
+
+ // find the first key greater than or equal to the end key; that will be the last index
+ endIndex = endLeaf->LowerBound(end);
+ endVal = endLeaf->GetValue(endIndex);
+ }
+ else
+ {
+ // find the first key in the start leaf that is greater than or equal to the start key
+ startIndex = startLeaf->LowerBound(start);
+ startVal = startLeaf->GetValue(startIndex);
+ // find the last key in the end leaf that is less than or equal to the end key
+ endIndex = endLeaf->UpperBound(end) - 1;
+ endVal = endLeaf->GetValue(endIndex);
+ }
+
+ // iterate over the leaves between startLeaf[startIndex] and endLeaf[endIndex] (inclusive) and collect all tombstones
+ BTreeNode* leaf = startLeaf;
+ uint numScanned = 0;
+ while (leaf != null)
+ {
+ int first, last;
+ bool scanComplete = false;
+ if (reverse)
+ {
+ // we would like an inverse traversal
+ first = leaf == startLeaf ? startIndex : leaf->info->count - 1;
+ last = leaf == endLeaf ? endIndex : 0;
+ }
+ else
+ {
+ last = leaf == endLeaf ? endIndex : leaf->info->count - 1;
+ first = leaf == startLeaf ? startIndex : 0;
+ }
+
+ for (var i = first; ;)
+ {
+ numScanned++;
+ var value = leaf->GetValue(i);
+ if (!value.Valid)
+ {
+ tombstones.Add(leaf->GetValue(i));
+ }
+ else
+ {
+ // entry will be part of result set
+ count++;
+ if (limit != -1 && count >= limit)
+ {
+ // update address as required
+ if (reverse)
+ {
+ startVal = value;
+ }
+ else
+ {
+ endVal = value;
+ }
+ scanComplete = true;
+ break;
+ }
+ }
+
+ if (reverse)
+ {
+ if (i <= last)
+ {
+ break;
+ }
+ i--;
+ }
+ else
+ {
+ if (i >= last)
+ {
+ break;
+ }
+ i++;
+ }
+ }
+
+ if (leaf == endLeaf || scanComplete)
+ {
+ break;
+ }
+
+ leaf = reverse ? leaf->info->previous : leaf->info->next;
+ }
+ return count;
+ }
+ }
+}
\ No newline at end of file
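
The range overload returns the number of live entries between two keys and collects the tombstones it passed over; startVal/endVal are pinned to the boundary values actually used. A forward-scan sketch over the full key space (assumes a non-empty tree, same hypothetical embedding assumptions as earlier):

```csharp
using System;
using System.Collections.Generic;
using Garnet.server.BTreeIndex;

public static unsafe class RangeSketch
{
    public static void Demo(BTree tree)
    {
        byte* lo = stackalloc byte[BTreeNode.KEY_SIZE];
        byte* hi = stackalloc byte[BTreeNode.KEY_SIZE];
        new Span<byte>(lo, BTreeNode.KEY_SIZE).Clear();    // all-zero: smallest key
        new Span<byte>(hi, BTreeNode.KEY_SIZE).Fill(0xFF); // all-ones: largest key
        int live = tree.Get(lo, hi, out Value first, out Value last,
                            out List<Value> tombstones, limit: 10);
        Console.WriteLine($"{live} live entries, {tombstones.Count} tombstones seen");
    }
}
```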
diff --git a/libs/server/BTreeIndex/BTreeTraverse.cs b/libs/server/BTreeIndex/BTreeTraverse.cs
new file mode 100644
index 00000000000..8c9d482045f
--- /dev/null
+++ b/libs/server/BTreeIndex/BTreeTraverse.cs
@@ -0,0 +1,50 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+namespace Garnet.server.BTreeIndex
+{
+ public unsafe partial class BTree
+ {
+ public byte* TraverseToLeaf(ref BTreeNode* node, ref BTreeNode*[] nodesTraversed, byte* key)
+ {
+ byte* leafMax = null;
+ BTreeNode* child = root;
+ for (var i = stats.depth - 1; i > 0; --i)
+ {
+ node = child;
+ nodesTraversed[i] = child;
+ var slot = node->UpperBound(key);
+ if (slot != node->info->count)
+ {
+ leafMax = node->GetKey(slot);
+ }
+ child = node->GetChild(slot);
+ }
+ node = child;
+ nodesTraversed[0] = child;
+ return leafMax;
+ }
+
+ public byte* TraverseToLeaf(ref BTreeNode* node, ref BTreeNode*[] nodesTraversed, byte* key, out int[] slots)
+ {
+ slots = new int[MAX_TREE_DEPTH];
+ byte* leafMax = null;
+ BTreeNode* child = root;
+ for (var i = stats.depth - 1; i > 0; --i)
+ {
+ node = child;
+ nodesTraversed[i] = child;
+ var slot = node->UpperBound(key);
+ slots[i] = slot;
+ if (slot != node->info->count)
+ {
+ leafMax = node->GetKey(slot);
+ }
+ child = node->GetChild(slot);
+ }
+ node = child;
+ nodesTraversed[0] = child;
+ return leafMax;
+ }
+ }
+}
\ No newline at end of file
diff --git a/libs/server/BTreeIndex/BTreeTrim.cs b/libs/server/BTreeIndex/BTreeTrim.cs
new file mode 100644
index 00000000000..72a635992e7
--- /dev/null
+++ b/libs/server/BTreeIndex/BTreeTrim.cs
@@ -0,0 +1,346 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System;
+using System.Diagnostics;
+
+namespace Garnet.server.BTreeIndex
+{
+ public unsafe partial class BTree
+ {
+ public void TrimByID(byte* key, out int underflowingNodes, out ulong entriesTrimmed, out Value headValidValue, out ReadOnlySpan<byte> headValidKey, out uint numLeavesDeleted)
+ {
+ underflowingNodes = 0;
+ entriesTrimmed = 0;
+ numLeavesDeleted = 0;
+
+ var nodesTraversed = new BTreeNode*[MAX_TREE_DEPTH];
+ BTreeNode* leaf = null;
+ TraverseToLeaf(ref leaf, ref nodesTraversed, key, out int[] internalSlots);
+
+ // find index for key in leaf node - this returns the index of first key >= given key
+ var index = leaf->LowerBound(key);
+ headValidKey = new ReadOnlySpan<byte>(leaf->GetKey(index), BTreeNode.KEY_SIZE);
+ headValidValue = leaf->GetValue(index);
+
+ // insert tombstones until index to mark as deleted
+ for (var i = 0; i < index; i++)
+ {
+ leaf->SetValueValid(i, false);
+ leaf->info->validCount--;
+ entriesTrimmed++;
+ }
+
+ if (leaf == head)
+ {
+ numLeavesDeleted = 0;
+ return;
+ }
+
+ // traverse the leaf level to delete preceding leaf nodes
+ var node = leaf->info->previous;
+ var nodesToTraverseInSubtree = internalSlots[1] - 1;
+ uint deletedValidCount = (uint)(leaf->info->count - leaf->info->validCount);
+ var totalDeletedValidCount = deletedValidCount;
+ while (node != null)
+ {
+ var validCount = node->info->validCount;
+ var count = node->info->count;
+ if (nodesToTraverseInSubtree >= 0)
+ {
+ deletedValidCount += validCount;
+ nodesToTraverseInSubtree--;
+ }
+ totalDeletedValidCount += validCount;
+
+ var prev = node->info->previous;
+ if (prev == null)
+ {
+ Debug.Assert(node == head, "Head node should not have a previous node");
+ }
+
+ stats.numLeafNodes--;
+ stats.numKeys -= count;
+ stats.numValidKeys -= validCount;
+ entriesTrimmed += validCount;
+
+ // deallocate the node
+ Deallocate(ref node);
+ numLeavesDeleted++;
+
+ // continue iteration
+ node = prev;
+ }
+
+ leaf->info->previous = null;
+ head = leaf;
+
+ bool rootReassigned = false;
+ // traverse internal nodes except root and delete preceding internal nodes
+ for (int i = 1; i < stats.depth - 1; i++)
+ {
+ node = nodesTraversed[i];
+ var slotOfKey = internalSlots[i];
+
+ if (slotOfKey > 0)
+ {
+ // shift children leftwards until slotOfKey (inclusive) using ReadOnlySpan
+ var sourceSpan = new ReadOnlySpan<byte>(node->keys + (slotOfKey - 1) * BTreeNode.KEY_SIZE, ((slotOfKey - 1)) * BTreeNode.KEY_SIZE);
+ var destinationSpan = new Span<byte>(node->keys, ((slotOfKey - 1)) * BTreeNode.KEY_SIZE);
+ sourceSpan.CopyTo(destinationSpan);
+
+ var sourceChildrenSpan = new ReadOnlySpan<byte>(node->data.children + (slotOfKey - 1) + 1, ((slotOfKey - 1)) * sizeof(BTreeNode*));
+ var destinationChildrenSpan = new Span<byte>(node->data.children, ((slotOfKey - 1)) * sizeof(BTreeNode*));
+ sourceChildrenSpan.CopyTo(destinationChildrenSpan);
+ }
+ var prevCount = node->info->count;
+ node->info->count -= slotOfKey;
+ node->info->validCount -= deletedValidCount;
+
+ if (prevCount > BTreeNode.INTERNAL_CAPACITY / 2 && node->info->count < BTreeNode.INTERNAL_CAPACITY / 2)
+ {
+ underflowingNodes++;
+ }
+
+ node = nodesTraversed[i]->info->previous;
+ deletedValidCount = 0;
+ while (node != null)
+ {
+ var temp = node->info->previous;
+ if (nodesToTraverseInSubtree >= 0)
+ {
+ deletedValidCount += node->info->validCount;
+ nodesToTraverseInSubtree--;
+ }
+ Deallocate(ref node);
+ stats.numInternalNodes--;
+ node = temp;
+ }
+ nodesTraversed[i]->info->previous = null;
+ // corner case: slotOfKey points to last child => after deletion only one child remains
+ // delete all parent levels and re-assign root
+ if (i + 1 < stats.depth)
+ {
+ var nextSlot = internalSlots[i + 1];
+ if (nextSlot == nodesTraversed[i + 1]->info->count)
+ {
+ var newRoot = nodesTraversed[i];
+ var originalDepth = stats.depth;
+ for (int j = i + 1; j < originalDepth; j++)
+ {
+ var curr = nodesTraversed[j];
+ while (curr != null)
+ {
+ var pre = curr->info->previous;
+ Deallocate(ref curr);
+ stats.numInternalNodes--;
+ curr = pre;
+ }
+ stats.depth--;
+ }
+ root = newRoot;
+ rootReassigned = true;
+ break;
+ }
+ }
+ }
+ if (!rootReassigned && stats.depth > 1 && nodesTraversed[stats.depth - 1] != null)
+ {
+ nodesTraversed[stats.depth - 1]->info->validCount -= totalDeletedValidCount;
+ }
+ }
+
+ public void TrimByLength(ref BTreeNode* node, ulong length, out ulong entriesTrimmed, out Value headValidValue, out ReadOnlySpan<byte> headValidKey, out uint numLeavesDeleted, bool approximateTrimming)
+ {
+ var depth = stats.depth - 1;
+ ulong currentValidCount = 0;
+ var current = node;
+ int[] internalSlots = new int[MAX_TREE_DEPTH];
+ int underflowingNodes = 0;
+ entriesTrimmed = 0;
+ numLeavesDeleted = 0;
+ headValidKey = default;
+ BTreeNode*[] nodesTraversed = new BTreeNode*[MAX_TREE_DEPTH];
+
+ if (length >= stats.numValidKeys)
+ {
+ // nothing to trim: the stream already has no more than length valid entries, so the head stays at the first entry of the head leaf
+ headValidValue = head->GetValue(0);
+ headValidKey = new ReadOnlySpan<byte>(head->GetKey(0), BTreeNode.KEY_SIZE);
+ return;
+ }
+
+ nodesTraversed[depth] = current;
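+ // descend from the root, scanning children right to left and accumulating their valid counts,
+ // to locate the subtree that holds the boundary entry (the oldest entry we keep)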
+ while (depth > 0)
+ {
+ if (current->info->type == BTreeNodeType.Internal)
+ {
+ for (var i = current->info->count; i >= 0; i--)
+ {
+ var child = current->GetChild(i);
+ if (currentValidCount + child->info->validCount >= length)
+ {
+ nodesTraversed[depth - 1] = child;
+ internalSlots[depth] = i;
+ current = child;
+ break;
+ }
+ else
+ {
+ currentValidCount += child->info->validCount;
+ }
+ }
+ }
+ depth--;
+ }
+
+ if (approximateTrimming)
+ {
+ headValidValue = current->GetValue(0);
+ headValidKey = new ReadOnlySpan<byte>(current->GetKey(0), BTreeNode.KEY_SIZE);
+ }
+ else
+ {
+ ulong keepInCurrent = length - currentValidCount;
+ // the subtrees to the right of the traversal already hold currentValidCount of the newest entries,
+ // so tombstone the oldest valid entries in this boundary leaf and keep only the newest keepInCurrent
+ ulong toDelete = current->info->validCount >= keepInCurrent ? current->info->validCount - keepInCurrent : 0;
+ ulong deleted = 0;
+ headValidValue = default;
+ headValidKey = default;
+ for (int i = 0; i < current->info->count; i++)
+ {
+ if (current->IsValueValid(i))
+ {
+ if (deleted < toDelete)
+ {
+ // Mark as deleted
+ current->SetValueValid(i, false);
+ current->info->validCount--;
+ entriesTrimmed++;
+ stats.numValidKeys--;
+ deleted++;
+ }
+ else if (headValidKey.IsEmpty)
+ {
+ // the first surviving entry becomes the new head
+ headValidValue = current->GetValue(i);
+ headValidKey = new ReadOnlySpan<byte>(current->GetKey(i), BTreeNode.KEY_SIZE);
+ }
+ }
+ }
+ }
+
+ var leaf = current->info->previous;
+ uint deletedValidCount = 0;
+ var nodesToTraverseInSubtree = internalSlots[depth + 1] - 1;
+ while (leaf != null)
+ {
+ var count = leaf->info->count;
+ var validCount = leaf->info->validCount;
+
+ if (nodesToTraverseInSubtree >= 0)
+ {
+ deletedValidCount += validCount;
+ nodesToTraverseInSubtree--;
+ }
+ var prev = leaf->info->previous;
+ if (prev == null)
+ {
+ Debug.Assert(leaf == head, "Head node should not have a previous node");
+ }
+ stats.numLeafNodes--;
+ stats.numKeys -= count;
+ stats.numValidKeys -= validCount;
+ entriesTrimmed += validCount;
+
+ // deallocate the node
+ Deallocate(ref leaf);
+ numLeavesDeleted++;
+ leaf = prev;
+ }
+ current->info->previous = null;
+ head = current;
+ // traverse the internal nodes except root and delete preceding internal nodes
+ for (int i = 1; i < stats.depth - 1; i++)
+ {
+ var slotOfKey = internalSlots[i];
+ var inner = nodesTraversed[i];
+ if (inner == null)
+ {
+ break;
+ }
+ if (slotOfKey > 0)
+ {
+ // shift keys and children from slotOfKey to beginning
+ var sourceSpan = new ReadOnlySpan<byte>(inner->keys + (slotOfKey - 1) * BTreeNode.KEY_SIZE, (slotOfKey - 1) * BTreeNode.KEY_SIZE);
+ var destinationSpan = new Span<byte>(inner->keys, (slotOfKey - 1) * BTreeNode.KEY_SIZE);
+ sourceSpan.CopyTo(destinationSpan);
+
+ var sourceChildrenSpan = new ReadOnlySpan<byte>(inner->data.children + (slotOfKey - 1) + 1, (slotOfKey - 1) * sizeof(BTreeNode*));
+ var destinationChildrenSpan = new Span<byte>(inner->data.children, (slotOfKey - 1) * sizeof(BTreeNode*));
+ sourceChildrenSpan.CopyTo(destinationChildrenSpan);
+ }
+ var prevCount = inner->info->count;
+ inner->info->count -= slotOfKey;
+ nodesTraversed[i]->info->validCount -= deletedValidCount;
+
+ if (prevCount > BTreeNode.INTERNAL_CAPACITY / 2 && inner->info->count < BTreeNode.INTERNAL_CAPACITY / 2)
+ {
+ underflowingNodes++;
+ }
+ deletedValidCount = 0;
+ nodesToTraverseInSubtree = slotOfKey - 1;
+ inner = inner->info->previous;
+ while (inner != null && inner != root)
+ {
+ var temp = inner->info->previous;
+ if (nodesToTraverseInSubtree >= 0)
+ {
+ deletedValidCount += inner->info->validCount;
+ nodesToTraverseInSubtree--;
+ }
+ Deallocate(ref inner);
+ stats.numInternalNodes--;
+ inner = temp;
+ }
+ nodesTraversed[i]->info->previous = null;
+ // corner case: slotOfKey points to last child => after deletion only one child remains
+ // delete all parent levels and re-assign root
+ if (i + 1 < stats.depth)
+ {
+ var nextSlot = internalSlots[i + 1];
+ if (nextSlot == nodesTraversed[i + 1]->info->count)
+ {
+ var newRoot = nodesTraversed[i];
+ var originalDepth = stats.depth;
+ for (int j = i + 1; j < originalDepth; j++)
+ {
+ var curr = nodesTraversed[j];
+ while (curr != null)
+ {
+ var pre = curr->info->previous;
+ Deallocate(ref curr);
+ stats.numInternalNodes--;
+ curr = pre;
+ }
+ stats.depth--;
+ }
+ root = newRoot;
+ break;
+ }
+ }
+ }
+ }
+ public void TrimByID(byte* key, out ulong entriesTrimmed, out Value headValue, out ReadOnlySpan<byte> headValidKey, out uint numLeavesDeleted)
+ {
+ int underflowingNodes;
+ TrimByID(key, out underflowingNodes, out entriesTrimmed, out headValue, out headValidKey, out numLeavesDeleted);
+ }
+
+ public void TrimByLength(ulong length, out ulong entriesTrimmed, out Value headValue, out ReadOnlySpan<byte> headValidKey, out uint numLeavesDeleted, bool approximateTrimming = false)
+ {
+ TrimByLength(ref root, length, out entriesTrimmed, out headValue, out headValidKey, out numLeavesDeleted, approximateTrimming);
+ }
+ }
+}
\ No newline at end of file
diff --git a/libs/server/Resp/CmdStrings.cs b/libs/server/Resp/CmdStrings.cs
index 8a11599def4..daafab604a9 100644
--- a/libs/server/Resp/CmdStrings.cs
+++ b/libs/server/Resp/CmdStrings.cs
@@ -295,6 +295,15 @@ static partial class CmdStrings
public static ReadOnlySpan<byte> RESP_ERR_ZSET_MEMBER => "ERR could not decode requested zset member"u8;
public static ReadOnlySpan<byte> RESP_ERR_EXPDELSCAN_INVALID => "ERR Cannot execute EXPDELSCAN with background expired key deletion scan enabled"u8;
+ public static ReadOnlySpan<byte> RESP_ERR_STREAMS_DISABLED => "ERR STREAMS is disabled, enable it with --streams option."u8;
+ public static ReadOnlySpan<byte> RESP_ERR_XADD_WRONG_NUM_ARGS => "ERR wrong number of arguments for 'xadd' command"u8;
+ public static ReadOnlySpan<byte> RESP_ERR_XADD_INVALID_STREAM_ID => "ERR Invalid stream ID specified as stream command argument"u8;
+ public static ReadOnlySpan<byte> RESP_ERR_XADD_ID_NOT_GREATER => "ERR The ID specified in XADD is equal or smaller than the target stream top item"u8;
+ public static ReadOnlySpan<byte> RESP_ERR_XLEN_WRONG_NUM_ARGS => "ERR wrong number of arguments for 'xlen' command"u8;
+ public static ReadOnlySpan<byte> RESP_ERR_XRANGE_WRONG_NUM_ARGS => "ERR wrong number of arguments for 'xrange' command"u8;
+ public static ReadOnlySpan<byte> RESP_ERR_XDEL_WRONG_NUM_ARGS => "ERR wrong number of arguments for 'xdel' command"u8;
+ public static ReadOnlySpan<byte> RESP_ERR_XTRIM_WRONG_NUM_ARGS => "ERR wrong number of arguments for 'xtrim' command"u8;
+
/// <summary>
/// Response string templates
/// </summary>
diff --git a/libs/server/Resp/Parser/RespCommand.cs b/libs/server/Resp/Parser/RespCommand.cs
index 5f17ab37e1d..15a6641d750 100644
--- a/libs/server/Resp/Parser/RespCommand.cs
+++ b/libs/server/Resp/Parser/RespCommand.cs
@@ -195,6 +195,11 @@ public enum RespCommand : ushort
SUNIONSTORE,
SWAPDB,
UNLINK,
+ XADD,
+ XLEN,
+ XRANGE,
+ XDEL,
+ XTRIM,
ZADD,
ZCOLLECT,
ZDIFFSTORE,
@@ -958,6 +963,21 @@ private RespCommand FastParseArrayCommand(ref int count, ref ReadOnlySpan<byte>
}
break;
+ case 'X':
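+ // match the next 8 bytes of the buffer against the padded command name (e.g. "\r\nXADD\r\n") with a single ulong comparison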
+ if (*(ulong*)(ptr + 2) == MemoryMarshal.Read<ulong>("\r\nXADD\r\n"u8))
+ {
+ return RespCommand.XADD;
+ }
+ else if (*(ulong*)(ptr + 2) == MemoryMarshal.Read<ulong>("\r\nXLEN\r\n"u8))
+ {
+ return RespCommand.XLEN;
+ }
+ else if (*(ulong*)(ptr + 2) == MemoryMarshal.Read<ulong>("\r\nXDEL\r\n"u8))
+ {
+ return RespCommand.XDEL;
+ }
+ break;
+
case 'Z':
if (*(ulong*)(ptr + 2) == MemoryMarshal.Read<ulong>("\r\nZADD\r\n"u8))
{
@@ -1124,6 +1144,12 @@ private RespCommand FastParseArrayCommand(ref int count, ref ReadOnlySpan<byte>
return RespCommand.WATCH;
}
break;
+ case 'X':
+ if (*(ulong*)(ptr + 3) == MemoryMarshal.Read<ulong>("\nXTRIM\r\n"u8))
+ {
+ return RespCommand.XTRIM;
+ }
+ break;
case 'Z':
if (*(ulong*)(ptr + 3) == MemoryMarshal.Read<ulong>("\nZCARD\r\n"u8))
@@ -1312,6 +1338,13 @@ private RespCommand FastParseArrayCommand(ref int count, ref ReadOnlySpan<byte>
}
break;
+ case 'X':
+ if (*(ulong*)(ptr + 4) == MemoryMarshal.Read<ulong>("XRANGE\r\n"u8))
+ {
+ return RespCommand.XRANGE;
+ }
+ break;
+
case 'Z':
if (*(ulong*)(ptr + 4) == MemoryMarshal.Read<ulong>("ZCOUNT\r\n"u8))
{
diff --git a/libs/server/Resp/RespServerSession.cs b/libs/server/Resp/RespServerSession.cs
index 4734be1273f..15f9b7aa487 100644
--- a/libs/server/Resp/RespServerSession.cs
+++ b/libs/server/Resp/RespServerSession.cs
@@ -207,6 +207,8 @@ internal sealed unsafe partial class RespServerSession : ServerSessionBase
// Threshold for slow log in ticks (0 means disabled)
readonly long slowLogThreshold;
+ internal readonly SessionStreamCache sessionStreamCache;
+
/// <summary>
/// Create a new RESP server session
/// </summary>
@@ -286,6 +288,13 @@ public RespServerSession(
if (this.networkSender.GetMaxSizeSettings?.MaxOutputSize < sizeof(int))
this.networkSender.GetMaxSizeSettings.MaxOutputSize = sizeof(int);
}
+
+ // grab stream manager from storeWrapper
+ if (storeWrapper.serverOptions.EnableStreams)
+ {
+ this.streamManager = storeWrapper.streamManager;
+ sessionStreamCache = new SessionStreamCache();
+ }
}
///
@@ -932,6 +941,12 @@ private bool ProcessArrayCommands(RespCommand cmd, ref TGarnetApi st
RespCommand.SUNIONSTORE => SetUnionStore(ref storageApi),
RespCommand.SDIFF => SetDiff(ref storageApi),
RespCommand.SDIFFSTORE => SetDiffStore(ref storageApi),
+ // Stream Commands
+ RespCommand.XADD => StreamAdd(respProtocolVersion),
+ RespCommand.XLEN => StreamLength(),
+ RespCommand.XDEL => StreamDelete(),
+ RespCommand.XRANGE => StreamRange(respProtocolVersion),
+ RespCommand.XTRIM => StreamTrim(),
_ => ProcessOtherCommands(cmd, ref storageApi)
};
return success;
diff --git a/libs/server/Resp/StreamCommands.cs b/libs/server/Resp/StreamCommands.cs
new file mode 100644
index 00000000000..35df8615a8c
--- /dev/null
+++ b/libs/server/Resp/StreamCommands.cs
@@ -0,0 +1,301 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System;
+using Garnet.common;
+using Tsavorite.core;
+
+namespace Garnet.server
+{
+ internal sealed unsafe partial class RespServerSession : ServerSessionBase
+ {
+ readonly StreamManager streamManager;
+ /// <summary>
+ /// Adds a new entry to the stream.
+ /// </summary>
+ /// <returns>true if stream was added successfully; error otherwise</returns>
+ private unsafe bool StreamAdd(byte respProtocolVersion)
+ {
+ if (parseState.Count < 4)
+ {
+ return AbortWithErrorMessage(CmdStrings.RESP_ERR_XADD_WRONG_NUM_ARGS);
+ }
+
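+ // command is of format: XADD key [NOMKSTREAM] <*|id> field value [field value ...]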
+ int argsParsed = 0;
+
+ // Parse the stream key.
+ var key = parseState.GetArgSliceByRef(0);
+ argsParsed++;
+
+ bool noMkStream = false;
+ if (argsParsed < parseState.Count && parseState.GetArgSliceByRef(argsParsed).ToString().ToUpper().Equals("NOMKSTREAM"))
+ {
+ noMkStream = true;
+ argsParsed++;
+ }
+
+ // Parse the id. We parse as string for easy pattern matching.
+ var idGiven = parseState.GetArgSliceByRef(argsParsed);
+
+ // get the number of the remaining key-value pairs
+ var numPairs = parseState.Count - argsParsed;
+
+ // grab the rest of the input that will mainly be k-v pairs as entry to the stream.
+ byte* vPtr = parseState.GetArgSliceByRef(argsParsed).ptr - sizeof(int);
+ int vsize = (int)(recvBufferPtr + endReadHead - vPtr);
+ var streamDataSpan = new ReadOnlySpan<byte>(vPtr, vsize);
+ SpanByteAndMemory _output = new SpanByteAndMemory(dcurr, (int)(dend - dcurr));
+
+ var disabledStreams = streamManager == null;
+ if (disabledStreams)
+ {
+ while (!RespWriteUtils.TryWriteError(CmdStrings.RESP_ERR_STREAMS_DISABLED, ref dcurr, dend))
+ SendAndReset();
+ return true;
+ }
+
+ if (sessionStreamCache.TryGetStreamFromCache(key.Span, out StreamObject cachedStream))
+ {
+ cachedStream.AddEntry(streamDataSpan, vsize, idGiven, numPairs, ref _output, respProtocolVersion);
+ }
+ else
+ {
+ streamManager.StreamAdd(key, idGiven, noMkStream, streamDataSpan, vsize, numPairs, ref _output, out byte[] lastStreamKey, out StreamObject lastStream, respProtocolVersion);
+ // since we added to a new stream that was not in the cache, try adding it to the cache
+ if (lastStream != null)
+ {
+ sessionStreamCache.TryAddStreamToCache(lastStreamKey, lastStream);
+ }
+ }
+ ProcessOutput(_output);
+ return true;
+ }
+
+ /// <summary>
+ /// Retrieves the length of the stream.
+ /// </summary>
+ /// <returns>true if stream length was retrieved successfully; error otherwise</returns>
+ private bool StreamLength()
+ {
+ if (parseState.Count != 1)
+ {
+ return AbortWithErrorMessage(CmdStrings.RESP_ERR_XLEN_WRONG_NUM_ARGS);
+ }
+ // parse the stream key.
+ var key = parseState.GetArgSliceByRef(0);
+
+ ulong streamLength;
+
+ var disabledStreams = streamManager == null;
+ if (disabledStreams)
+ {
+ while (!RespWriteUtils.TryWriteError(CmdStrings.RESP_ERR_STREAMS_DISABLED, ref dcurr, dend))
+ SendAndReset();
+ return true;
+ }
+
+ // check if the stream exists in cache
+ if (sessionStreamCache.TryGetStreamFromCache(key.Span, out StreamObject cachedStream))
+ {
+ streamLength = cachedStream.Length();
+ }
+ else
+ {
+ streamLength = streamManager.StreamLength(key);
+ }
+ // write back result
+ while (!RespWriteUtils.TryWriteInt64((long)streamLength, ref dcurr, dend))
+ SendAndReset();
+ return true;
+ }
+
+ /// <summary>
+ /// Retrieves a range of stream entries.
+ /// </summary>
+ /// <returns>true if range of stream entries were retrieved successfully; error otherwise</returns>
+ public unsafe bool StreamRange(byte respProtocolVersion)
+ {
+ // command is of format: XRANGE key start end [COUNT count]
+ // we expect at least 3 arguments
+ if (parseState.Count < 3)
+ {
+ return AbortWithErrorMessage(CmdStrings.RESP_ERR_XRANGE_WRONG_NUM_ARGS);
+ }
+
+ // parse the stream key
+ var key = parseState.GetArgSliceByRef(0);
+
+ // parse start and end IDs
+ var startId = parseState.GetArgSliceByRef(1).ToString();
+ var endId = parseState.GetArgSliceByRef(2).ToString();
+
+ int count = -1;
+ if (parseState.Count > 3)
+ {
+ // parse the COUNT keyword and its value
+ if (parseState.Count < 5 || !parseState.GetArgSliceByRef(3).ToString().ToUpper().Equals("COUNT"))
+ {
+ return AbortWithErrorMessage(CmdStrings.RESP_ERR_GENERIC_SYNTAX_ERROR);
+ }
+ var countStr = parseState.GetArgSliceByRef(4).ToString();
+ if (!int.TryParse(countStr, out count))
+ {
+ return AbortWithErrorMessage(CmdStrings.RESP_ERR_GENERIC_SYNTAX_ERROR);
+ }
+ }
+
+ SpanByteAndMemory _output = new SpanByteAndMemory(dcurr, (int)(dend - dcurr));
+
+ var disabledStreams = streamManager == null;
+ if (disabledStreams)
+ {
+ while (!RespWriteUtils.TryWriteError(CmdStrings.RESP_ERR_STREAMS_DISABLED, ref dcurr, dend))
+ SendAndReset();
+ return true;
+ }
+
+ bool success = false;
+
+ // check if the stream exists in cache
+ if (sessionStreamCache.TryGetStreamFromCache(key.Span, out StreamObject cachedStream))
+ {
+ cachedStream.ReadRange(startId, endId, count, ref _output, respProtocolVersion);
+ success = true;
+ }
+ else
+ {
+ success = streamManager.StreamRange(key, startId, endId, count, ref _output, respProtocolVersion);
+ }
+ if (success)
+ {
+ // _ = ProcessOutputWithHeader(_output);
+ ProcessOutput(_output);
+ }
+ else
+ {
+ //return empty array
+ while (!RespWriteUtils.TryWriteArrayLength(0, ref dcurr, dend))
+ SendAndReset();
+ return true;
+ }
+
+ return true;
+ }
+
+ /// <summary>
+ /// Deletes one or more stream entries.
+ /// </summary>
+ /// <returns>true if the entries were deleted successfully; error otherwise</returns>
+ public bool StreamDelete()
+ {
+ // command is of format: XDEL key id [id ...]
+ // we expect at least 2 arguments
+ if (parseState.Count < 2)
+ {
+ return AbortWithErrorMessage(CmdStrings.RESP_ERR_XDEL_WRONG_NUM_ARGS);
+ }
+
+ // parse the stream key
+ var key = parseState.GetArgSliceByRef(0);
+ int deletedCount = 0;
+
+ var disabledStreams = streamManager == null;
+ if (disabledStreams)
+ {
+ while (!RespWriteUtils.TryWriteError(CmdStrings.RESP_ERR_STREAMS_DISABLED, ref dcurr, dend))
+ SendAndReset();
+ return true;
+ }
+
+ // for every id, parse and delete the stream entry
+ for (int i = 1; i < parseState.Count; i++)
+ {
+ // parse the id as string
+ var idGiven = parseState.GetArgSliceByRef(i);
+
+ bool deleted;
+ // check if the stream exists in cache
+ if (sessionStreamCache.TryGetStreamFromCache(key.Span, out StreamObject cachedStream))
+ {
+ deleted = cachedStream.DeleteEntry(idGiven);
+ }
+ else
+ {
+ // delete the entry in the stream from the streamManager
+ deleted = streamManager.StreamDelete(key, idGiven, out StreamObject lastStream);
+ if (lastStream != null)
+ {
+ // since we deleted from a stream that was not in the cache, try adding it to the cache
+ sessionStreamCache.TryAddStreamToCache(key.ToArray(), lastStream);
+ }
+ }
+
+ deletedCount = deleted ? deletedCount + 1 : deletedCount;
+ }
+
+ // write back the number of entries deleted
+ while (!RespWriteUtils.TryWriteInt64(deletedCount, ref dcurr, dend))
+ SendAndReset();
+ return true;
+ }
+
+ /// <summary>
+ /// Trims the stream to the specified length or ID.
+ /// </summary>
+ /// <returns>true if stream was trimmed successfully; error otherwise</returns>
+ public bool StreamTrim()
+ {
+ if (parseState.Count < 3)
+ {
+ return AbortWithErrorMessage(CmdStrings.RESP_ERR_XTRIM_WRONG_NUM_ARGS);
+ }
+
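+ // command is of format: XTRIM key <MAXLEN|MINID> [~] threshold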
+ var key = parseState.GetArgSliceByRef(0);
+ var trimType = parseState.GetArgSliceByRef(1).ToString().ToUpper();
+ bool approximate = false;
+ int trimArgIndex = 2;
+ // Check for optional ~
+ if (parseState.Count > 3 && parseState.GetArgSliceByRef(2).ToString() == "~")
+ {
+ approximate = true;
+ trimArgIndex++;
+ }
+ var trimArg = parseState.GetArgSliceByRef(trimArgIndex);
+
+ ulong entriesTrimmed = 0;
+ StreamTrimOpts optType = StreamTrimOpts.NONE;
+ switch (trimType)
+ {
+ case "MAXLEN":
+ optType = StreamTrimOpts.MAXLEN;
+ break;
+ case "MINID":
+ optType = StreamTrimOpts.MINID;
+ break;
+ default:
+ // any other trim strategy is a syntax error
+ return AbortWithErrorMessage(CmdStrings.RESP_ERR_GENERIC_SYNTAX_ERROR);
+ }
+
+ var disabledStreams = streamManager == null;
+ if (disabledStreams)
+ {
+ while (!RespWriteUtils.TryWriteError(CmdStrings.RESP_ERR_STREAMS_DISABLED, ref dcurr, dend))
+ SendAndReset();
+ return true;
+ }
+
+ bool result;
+ if (sessionStreamCache.TryGetStreamFromCache(key.Span, out StreamObject cachedStream))
+ {
+ result = cachedStream.Trim(trimArg, optType, out entriesTrimmed, approximate);
+ }
+ else
+ {
+ result = streamManager.StreamTrim(key, trimArg, optType, out entriesTrimmed, approximate);
+ }
+ if (!result)
+ {
+ return AbortWithErrorMessage(CmdStrings.RESP_ERR_GENERIC_SYNTAX_ERROR);
+ }
+ while (!RespWriteUtils.TryWriteInt64((long)entriesTrimmed, ref dcurr, dend))
+ SendAndReset();
+ return true;
+ }
+
+ }
+}
\ No newline at end of file
diff --git a/libs/server/Servers/GarnetServerOptions.cs b/libs/server/Servers/GarnetServerOptions.cs
index 5329d1992fd..c87b7328082 100644
--- a/libs/server/Servers/GarnetServerOptions.cs
+++ b/libs/server/Servers/GarnetServerOptions.cs
@@ -529,6 +529,19 @@ public string GetObjectStoreCheckpointDirectory(int dbId) =>
public string GetAppendOnlyFileDirectory(int dbId) =>
Path.Combine(AppendOnlyFileBaseDirectory, GetAppendOnlyFileDirectoryName(dbId));
+ /// <summary>
+ /// Enable STREAMS on server
+ /// </summary>
+ public bool EnableStreams = false;
+
+ /// <summary>
+ /// Page size for BTree index for STREAM
+ /// </summary>
+ public string StreamPageSize = "4m";
+
+ /// <summary>
+ /// Memory for STREAM
+ /// </summary>
+ public string StreamMemorySize = "1g";
+
///
/// Constructor
///
@@ -691,6 +704,32 @@ public KVSettings GetSettings(ILoggerFactory loggerFactory,
return kvSettings;
}
+ /// <summary>
+ /// Get stream page size
+ /// </summary>
+ /// <returns></returns>
+ public long StreamPageSizeBytes()
+ {
+ long size = ParseSize(StreamPageSize);
+ long adjustedSize = PreviousPowerOf2(size);
+ if (size != adjustedSize)
+ logger?.LogInformation("Warning: using lower stream page size than specified (power of 2)");
+ return adjustedSize;
+ }
+
+ /// <summary>
+ /// Get stream memory size
+ /// </summary>
+ /// <returns></returns>
+ public long StreamMemorySizeBytes()
+ {
+ long size = ParseSize(StreamMemorySize);
+ long adjustedSize = PreviousPowerOf2(size);
+ if (size != adjustedSize)
+ logger?.LogInformation("Warning: using lower stream memory size than specified (power of 2)");
+ return adjustedSize;
+ }
+
/// <summary>
/// Get memory size
/// </summary>
diff --git a/libs/server/StoreWrapper.cs b/libs/server/StoreWrapper.cs
index e8569eda2b1..f218fc54f8a 100644
--- a/libs/server/StoreWrapper.cs
+++ b/libs/server/StoreWrapper.cs
@@ -163,6 +163,8 @@ public sealed class StoreWrapper
///
public GarnetCheckpointManager ObjectStoreCheckpointManager => (GarnetCheckpointManager)objectStore?.CheckpointManager;
+ internal readonly StreamManager streamManager;
+
/// <summary>
/// Constructor
/// </summary>
@@ -270,6 +272,10 @@ public StoreWrapper(
ObjectStoreCheckpointManager.CurrentHistoryId = runId;
}
}
+ if (serverOptions.EnableStreams)
+ {
+ this.streamManager = new StreamManager(serverOptions.StreamPageSizeBytes(), serverOptions.StreamMemorySizeBytes(), 0);
+ }
}
///
@@ -288,6 +294,11 @@ public StoreWrapper(StoreWrapper storeWrapper, bool recordToAof) : this(storeWra
clusterFactory: null,
loggerFactory: storeWrapper.loggerFactory)
{
+ // initialize stream manager
+ if (serverOptions.EnableStreams)
+ {
+ this.streamManager = new StreamManager(serverOptions.StreamPageSizeBytes(), serverOptions.StreamMemorySizeBytes(), 0);
+ }
}
///
diff --git a/libs/server/Stream/SessionStreamCache.cs b/libs/server/Stream/SessionStreamCache.cs
new file mode 100644
index 00000000000..4f6ee3ba288
--- /dev/null
+++ b/libs/server/Stream/SessionStreamCache.cs
@@ -0,0 +1,59 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System;
+using System.Collections.Generic;
+
+namespace Garnet.server
+{
+ internal class SessionStreamCache
+ {
+ const int DefaultCacheSize = 16;
+ readonly Dictionary<byte[], StreamObject> streamCache = new Dictionary<byte[], StreamObject>(DefaultCacheSize, ByteArrayComparer.Instance);
+ readonly byte[][] streamKeysCache = new byte[DefaultCacheSize][];
+ int cachedStreamsCount = 0;
+ int front = 0;
+
+ public SessionStreamCache()
+ { }
+
+ /// <summary>
+ /// Look up a stream in the cache. The cache is bounded to a small fixed size, so lookups stay cheap.
+ /// </summary>
+ /// <param name="key">name of stream to lookup</param>
+ /// <param name="stream">stream found from the cache</param>
+ /// <returns>true if stream exists in cache</returns>
+ public bool TryGetStreamFromCache(ReadOnlySpan<byte> key, out StreamObject stream)
+ {
+ return streamCache.TryGetValue(key.ToArray(), out stream);
+ }
+
+ /// <summary>
+ /// Add a stream to the cache. If the cache is full, the oldest cached stream is evicted to make room.
+ /// </summary>
+ /// <param name="key">name of stream</param>
+ /// <param name="stream">reference to stream object</param>
+ /// <returns>true if successfully added</returns>
+ public bool TryAddStreamToCache(byte[] key, StreamObject stream)
+ {
+ if (cachedStreamsCount < DefaultCacheSize)
+ {
+ streamCache.Add(key, stream);
+ // add to circular array and update front
+ streamKeysCache[front] = key;
+ front = (front + 1) % DefaultCacheSize;
+ cachedStreamsCount++;
+ return true;
+ }
+
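+ // cache is full: evict the oldest cached stream (the slot front points to) and reuse its slot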
+ streamCache.Remove(streamKeysCache[front]);
+ streamCache.Add(key, stream);
+ // add to circular array where we removed the oldest stream
+ streamKeysCache[front] = key;
+ front = (front + 1) % DefaultCacheSize;
+ // we don't need to update cachedStreamsCount since we added and removed a stream
+ return true;
+
+ }
+ }
+}
\ No newline at end of file
diff --git a/libs/server/Stream/Stream.cs b/libs/server/Stream/Stream.cs
new file mode 100644
index 00000000000..6b74ca6ccee
--- /dev/null
+++ b/libs/server/Stream/Stream.cs
@@ -0,0 +1,616 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System;
+using Tsavorite.core;
+using Garnet.server.BTreeIndex;
+using Garnet.common;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Buffers.Binary;
+
+namespace Garnet.server
+{
+ public enum StreamTrimOpts
+ {
+ MAXLEN,
+ MINID,
+ NONE
+ }
+
+ public enum XADDOpts
+ {
+ NOMKSTREAM,
+ NONE
+ }
+
+ public enum ParsedStreamEntryID
+ {
+ VALID,
+ INVALID,
+ NOT_GREATER,
+ }
+
+ public class StreamObject : IDisposable
+ {
+ readonly IDevice device;
+ readonly TsavoriteLog log;
+ readonly BTree index;
+ StreamID lastId;
+ long totalEntriesAdded;
+ SingleWriterMultiReaderLock _lock;
+
+ /// <summary>
+ /// Constructor
+ /// </summary>
+ /// <param name="logDir">Directory where the log will be stored</param>
+ /// <param name="pageSize">Page size of the log used for the stream</param>
+ public StreamObject(string logDir, long pageSize, long memorySize, int safeTailRefreshFreqMs)
+ {
+ device = logDir == null ? new NullDevice() : Devices.CreateLogDevice("streamLogs/" + logDir + "/streamLog", preallocateFile: false);
+ log = new TsavoriteLog(new TsavoriteLogSettings { LogDevice = device, PageSize = pageSize, MemorySize = memorySize, SafeTailRefreshFrequencyMs = safeTailRefreshFreqMs });
+ index = new BTree(device.SectorSize);
+ totalEntriesAdded = 0;
+ lastId = default;
+ _lock = new SingleWriterMultiReaderLock();
+ }
+
+ /// <summary>
+ /// Increment the stream ID
+ /// </summary>
+ /// <param name="incrementedID">carries the incremented stream id</param>
+ public void IncrementID(ref StreamID incrementedID)
+ {
+ var originalMs = lastId.getMS();
+ var originalSeq = lastId.getSeq();
+
+ if (originalMs == ulong.MaxValue)
+ {
+ incrementedID = default;
+ return;
+ }
+
+ var newMs = originalMs;
+ var newSeq = originalSeq + 1;
+
+ // if seq overflows, increment timestamp and reset seq
+ if (newSeq == 0)
+ {
+ newMs += 1;
+ newSeq = 0;
+ }
+
+ incrementedID.setMS(newMs);
+ incrementedID.setSeq(newSeq);
+
+ }
+
+ /// <summary>
+ /// Generate the next stream ID
+ /// </summary>
+ /// <param name="id">StreamID generated</param>
+ public unsafe void GenerateNextID(ref StreamID id)
+ {
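+ // current time in milliseconds: Stopwatch ticks divided by ticks-per-millisecond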
+ ulong timestamp = (ulong)Stopwatch.GetTimestamp() / (ulong)(Stopwatch.Frequency / 1000);
+
+ // read existing timestamp in big endian format
+ var lastTs = lastId.getMS();
+ // if this is the first entry or timestamp is greater than last added entry
+ if (totalEntriesAdded == 0 || timestamp > lastTs)
+ {
+ // this will write timestamp in big endian format
+ id.setMS(timestamp);
+ id.setSeq(0);
+ return;
+ }
+ // if timestamp is same as last added entry, increment the sequence number
+ // if seq overflows, increment timestamp and reset the sequence number
+ IncrementID(ref id);
+ }
+
+ unsafe ParsedStreamEntryID parseIDString(ArgSlice idSlice, ref StreamID id)
+ {
+ // if we have to auto-generate the whole ID
+ if (*idSlice.ptr == '*' && idSlice.length == 1)
+ {
+ GenerateNextID(ref id);
+ return ParsedStreamEntryID.VALID;
+ }
+
+ var lastIdDecodedTs = lastId.getMS();
+
+ // parse user-defined ID
+ // can be of following formats:
+ // 1. ts (seq = 0)
+ // 2. ts-* (auto-generate seq number)
+ // 3. ts-seq
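+ // e.g. "1526919030474", "1526919030474-*", "1526919030474-55"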
+
+ // last character is a *
+ if (*(idSlice.ptr + idSlice.length - 1) == '*')
+ {
+ // has to be of format ts-*, check if '-' is the preceding character
+ if (*(idSlice.ptr + idSlice.length - 2) != '-')
+ {
+ return ParsedStreamEntryID.INVALID;
+ }
+ // parse the timestamp, ignoring the trailing "-*"
+ var idEnd = idSlice.ptr + idSlice.length - 2;
+ if (!RespReadUtils.ReadUlong(out ulong timestamp, ref idSlice.ptr, idEnd))
+ {
+ return ParsedStreamEntryID.INVALID;
+ }
+
+ // check if timestamp is greater than last added entry's decoded ts
+ if (totalEntriesAdded != 0 && timestamp < lastIdDecodedTs)
+ {
+ return ParsedStreamEntryID.NOT_GREATER;
+ }
+ else if (totalEntriesAdded != 0 && timestamp == lastIdDecodedTs)
+ {
+ IncrementID(ref id);
+ }
+ else
+ {
+ id.setMS(timestamp);
+ id.setSeq(0);
+ }
+ }
+ else
+ {
+ // find index of '-' in the id
+ int index = -1;
+ for (int i = 0; i < idSlice.length; i++)
+ {
+ if (*(idSlice.ptr + i) == '-')
+ {
+ index = i;
+ break;
+ }
+ }
+ // if '-' is not found, format should be just ts
+ if (index == -1)
+ {
+ if (!RespReadUtils.ReadUlong(out ulong timestamp, ref idSlice.ptr, idSlice.ptr + idSlice.length))
+ {
+ return ParsedStreamEntryID.INVALID;
+ }
+ // check if timestamp is greater than last added entry
+ if (totalEntriesAdded != 0 && timestamp < lastIdDecodedTs)
+ {
+ return ParsedStreamEntryID.NOT_GREATER;
+ }
+ else if (totalEntriesAdded != 0 && timestamp == lastIdDecodedTs)
+ {
+ IncrementID(ref id);
+ }
+ else
+ {
+ id.setMS(timestamp);
+ id.setSeq(0);
+ }
+ }
+ else
+ {
+ // parse the timestamp (everything before '-')
+ if (!RespReadUtils.ReadUlong(out ulong timestamp, ref idSlice.ptr, idSlice.ptr + index))
+ {
+ return ParsedStreamEntryID.INVALID;
+ }
+ // idSlice.ptr now points at '-', so the sequence number starts right after it
+ var seqBegin = idSlice.ptr + 1;
+ var seqEnd = seqBegin + (idSlice.length - index - 1);
+ if (!RespReadUtils.ReadUlong(out ulong seq, ref seqBegin, seqEnd))
+ {
+ return ParsedStreamEntryID.INVALID;
+ }
+
+ if (totalEntriesAdded != 0 && timestamp < lastIdDecodedTs)
+ {
+ return ParsedStreamEntryID.NOT_GREATER;
+ }
+ else if (totalEntriesAdded != 0 && timestamp == lastIdDecodedTs)
+ {
+ if (seq <= lastId.getSeq())
+ {
+ return ParsedStreamEntryID.NOT_GREATER;
+ }
+ }
+ // use ID and seq given by user
+ // encode while storing
+ id.setMS(timestamp);
+ id.setSeq(seq);
+ }
+ }
+
+ return ParsedStreamEntryID.VALID;
+ }
+
+ /// <summary>
+ /// Adds an entry or item to the stream
+ /// </summary>
+ /// <param name="value">byte array of the entry to store in the stream</param>
+ /// <returns>True if entry is added successfully</returns>
+ public unsafe void AddEntry(ReadOnlySpan value, int valueLength, ArgSlice idSlice, int numPairs, ref SpanByteAndMemory output, byte respProtocolVersion)
+ {
+ StreamID id = default;
+ using var writer = new RespMemoryWriter(respProtocolVersion, ref output);
+ // take a lock to ensure thread safety
+ _lock.WriteLock();
+
+ try
+ {
+ var parsedIDStatus = parseIDString(idSlice, ref id);
+ if (parsedIDStatus == ParsedStreamEntryID.INVALID)
+ {
+ writer.WriteError(CmdStrings.RESP_ERR_XADD_INVALID_STREAM_ID);
+ return;
+ }
+ else if (parsedIDStatus == ParsedStreamEntryID.NOT_GREATER)
+ {
+ writer.WriteError(CmdStrings.RESP_ERR_XADD_ID_NOT_GREATER);
+ return;
+ }
+
+ // add the entry to the log
+ {
+ bool enqueueInLog = log.TryEnqueueStreamEntry(id.idBytes, sizeof(StreamID), numPairs, value, valueLength, out long retAddress);
+ if (!enqueueInLog)
+ {
+ writer.WriteNull();
+ return;
+ }
+
+ var streamValue = new Value((ulong)retAddress);
+
+ bool added = index.Insert((byte*)Unsafe.AsPointer(ref id.idBytes[0]), streamValue);
+
+ if (!added)
+ {
+ writer.WriteNull();
+ return;
+ }
+ // copy encoded ms and seq
+ lastId.ms = id.ms;
+ lastId.seq = id.seq;
+
+ totalEntriesAdded++;
+ // write back the decoded ID of the entry added
+ string idString = $"{id.getMS()}-{id.getSeq()}";
+ // write id as bulk string
+ writer.WriteAsciiBulkString(idString);
+ }
+ }
+ finally
+ {
+ // log.Commit();
+ _lock.WriteUnlock();
+ }
+
+ }
+
+ /// <summary>
+ /// Get current length of the stream (number of entries in the stream)
+ /// </summary>
+ /// <returns>length of stream</returns>
+ public ulong Length()
+ {
+ ulong len = 0;
+ _lock.ReadLock();
+ try
+ {
+ // get length of the stream from the index excluding tombstones
+ len = index.ValidCount;
+ }
+ finally
+ {
+ _lock.ReadUnlock();
+ }
+ return len;
+ }
+
+ /// <summary>
+ /// Deletes an entry from the stream
+ /// </summary>
+ /// <param name="idSlice">id of the stream entry to delete</param>
+ /// <returns>true if entry was deleted successfully</returns>
+ public unsafe bool DeleteEntry(ArgSlice idSlice)
+ {
+ // first parse the idString
+ if (!parseCompleteID(idSlice, out StreamID entryID))
+ {
+ return false;
+ }
+ bool deleted = false;
+ // take a lock to delete from the index
+ _lock.WriteLock();
+ try
+ {
+ deleted = index.Delete((byte*)Unsafe.AsPointer(ref entryID.idBytes[0]));
+ }
+ finally
+ {
+ _lock.WriteUnlock();
+ }
+ return deleted;
+ }
+
+ public bool ParseCompleteStreamIDFromString(string idString, out StreamID id)
+ {
+ id = default;
+ string[] parts = idString.Split('-');
+ if (parts.Length != 2)
+ {
+ return false;
+ }
+ if (!ulong.TryParse(parts[0], out ulong timestamp))
+ {
+ return false;
+ }
+ if (!ulong.TryParse(parts[1], out ulong seq))
+ {
+ return false;
+ }
+
+ id.setMS(timestamp);
+ id.setSeq(seq);
+ return true;
+ }
+
+ public bool ParseStreamIDFromString(string idString, out StreamID id)
+ {
+ id = default;
+ if (idString == "-" || idString == "+")
+ {
+ return false;
+ }
+ if (!idString.Contains('-'))
+ {
+
+ if (!ulong.TryParse(idString, out ulong ms))
+ {
+ return false;
+ }
+ id.setMS(ms);
+ id.setSeq(0);
+ return true;
+ }
+ return ParseCompleteStreamIDFromString(idString, out id);
+ }
+
+ /// <summary>
+ /// Read entries from the stream in the given range
+ /// </summary>
+ /// <param name="min">start of range</param>
+ /// <param name="max">end of range</param>
+ /// <param name="limit">maximum number of entries to return (-1 for no limit)</param>
+ /// <param name="output">output buffer to write the result to</param>
+ public unsafe void ReadRange(string min, string max, int limit, ref SpanByteAndMemory output, byte respProtocolVersion)
+ {
+ using var writer = new RespMemoryWriter(respProtocolVersion, ref output);
+ _lock.ReadLock();
+ try
+ {
+ if (index.Count() == 0)
+ {
+ return;
+ }
+
+ long startAddr, endAddr;
+ StreamID startID, endID;
+ if (min == "-")
+ {
+ byte[] idBytes = index.First().Key;
+ startID = new StreamID(idBytes);
+ }
+ else if (!ParseStreamIDFromString(min, out startID))
+ {
+ return;
+ }
+ if (max == "+")
+ {
+ byte[] idBytes = index.Last().Key;
+ endID = new StreamID(idBytes);
+ }
+ else
+ {
+ if (!ParseStreamIDFromString(max, out endID))
+ {
+ return;
+ }
+ endID.setSeq(ulong.MaxValue);
+ }
+
+ int count = index.Get((byte*)Unsafe.AsPointer(ref startID.idBytes[0]), (byte*)Unsafe.AsPointer(ref endID.idBytes[0]), out Value startVal, out Value endVal, out var tombstones, limit);
+ startAddr = (long)startVal.address;
+ endAddr = (long)endVal.address + 1;
+
+ byte* tmpPtr = null;
+ int tmpSize = 0;
+ long readCount = 0;
+
+ try
+ {
+ using (var iter = log.Scan(startAddr, endAddr, scanUncommitted: true))
+ {
+
+ writer.WriteArrayLength(count);
+
+ byte* e;
+ while (iter.GetNext(out var entry, out _, out long currentAddress, out long nextAddress))
+ {
+
+ var current = new Value((ulong)currentAddress);
+ // check if any tombstone t.address matches current
+ var tombstoneFound = false;
+ foreach (var tombstone in tombstones)
+ {
+ if (tombstone.address == current.address)
+ {
+ tombstoneFound = true;
+ break;
+ }
+ }
+ if (tombstoneFound)
+ {
+ continue;
+ }
+
+ var entryBytes = entry.AsSpan();
+ // check if the entry is actually one of the qualified keys
+ // parse ID for the entry which is the first 16 bytes
+ var idBytes = entryBytes.Slice(0, 16);
+ var ts = BinaryPrimitives.ReadUInt64BigEndian(idBytes.Slice(0, 8));
+ var seq = BinaryPrimitives.ReadUInt64BigEndian(idBytes.Slice(8, 8));
+
+ string idString = $"{ts}-{seq}";
+ Span numPairsBytes = entryBytes.Slice(16, 4);
+ int numPairs = BitConverter.ToInt32(numPairsBytes);
+ Span value = entryBytes.Slice(20);
+
+ // we can already write back the ID that we read
+ writer.WriteArrayLength(2);
+
+ writer.WriteAsciiBulkString(idString);
+
+ // print array length for the number of key-value pairs in the entry
+ writer.WriteArrayLength(numPairs);
+
+ // write key-value pairs
+ fixed (byte* p = value)
+ {
+ e = p;
+ int read = 0;
+ while (value.Length - read >= 4)
+ {
+ var orig = e;
+ if (!RespReadUtils.TryReadPtrWithLengthHeader(ref tmpPtr, ref tmpSize, ref e, e + entry.Length))
+ {
+ return;
+ }
+ var o = new Span(tmpPtr, tmpSize).ToArray();
+ writer.WriteBulkString(o);
+ read += (int)(e - orig);
+ }
+ }
+ readCount++;
+ if (limit != -1 && readCount == limit)
+ {
+ break;
+ }
+ }
+ }
+ }
+ finally
+ { }
+ }
+ finally
+ {
+ _lock.ReadUnlock();
+ }
+ }
+
+ /// <summary>
+ /// Trims the stream based on the specified options.
+ /// </summary>
+ /// <param name="trimArg">length or ID specifying the threshold</param>
+ /// <param name="optType">MAXLEN or MINID</param>
+ /// <param name="entriesTrimmed">number of keys trimmed</param>
+ /// <returns></returns>
+ public unsafe bool Trim(ArgSlice trimArg, StreamTrimOpts optType, out ulong entriesTrimmed, bool approximate = false)
+ {
+ uint numLeavesDeleted = 0;
+ Value headValue = default;
+ _lock.WriteLock();
+ try
+ {
+ switch (optType)
+ {
+ case StreamTrimOpts.MAXLEN:
+ if (!RespReadUtils.ReadUlong(out ulong maxLen, ref trimArg.ptr, trimArg.ptr + trimArg.length))
+ {
+ entriesTrimmed = 0;
+ return false;
+ }
+ index.TrimByLength(maxLen, out entriesTrimmed, out headValue, out var headValidKey, out numLeavesDeleted, approximate);
+ break;
+ case StreamTrimOpts.MINID:
+ if (!parseCompleteID(trimArg, out StreamID minID))
+ {
+ entriesTrimmed = 0;
+ return false;
+ }
+ index.TrimByID((byte*)Unsafe.AsPointer(ref minID.idBytes[0]), out entriesTrimmed, out headValue, out headValidKey, out numLeavesDeleted);
+ break;
+ default:
+ entriesTrimmed = 0;
+ break;
+ }
+
+ if (numLeavesDeleted == 0)
+ {
+ // didn't delete any leaf nodes so done here
+ return true;
+ }
+ // truncate log to new head
+ var newHeadAddress = (long)headValue.address;
+ log.TruncateUntil(newHeadAddress);
+ }
+ finally
+ {
+ _lock.WriteUnlock();
+ }
+ return true;
+ }
+
+
+ unsafe bool parseCompleteID(ArgSlice idSlice, out StreamID streamID)
+ {
+ streamID = default;
+ // complete ID is of the format ts-seq in input where both ts and seq are ulong
+ // find the index of '-' in the id
+ int index = -1;
+ for (int i = 0; i < idSlice.length; i++)
+ {
+ if (*(idSlice.ptr + i) == '-')
+ {
+ index = i;
+ break;
+ }
+ }
+ // a complete ID must contain '-'
+ if (index == -1)
+ {
+ return false;
+ }
+ // parse the timestamp
+ if (!RespReadUtils.ReadUlong(out ulong timestamp, ref idSlice.ptr, idSlice.ptr + index))
+ {
+ return false;
+ }
+
+ // after reading the timestamp, the pointer will be at the '-' character
+ var seqBegin = idSlice.ptr + 1;
+ // parse the sequence number
+ if (!RespReadUtils.ReadUlong(out ulong seq, ref seqBegin, seqBegin + (idSlice.length - index - 1)))
+ {
+ return false;
+ }
+ streamID.setMS(timestamp);
+ streamID.setSeq(seq);
+ return true;
+ }
+
+ /// <inheritdoc/>
+ public void Dispose()
+ {
+ try
+ {
+ log.Dispose();
+ device.Dispose();
+ }
+ finally
+ { }
+ }
+ }
+}
\ No newline at end of file
diff --git a/libs/server/Stream/StreamID.cs b/libs/server/Stream/StreamID.cs
new file mode 100644
index 00000000000..a2ab0a99fb9
--- /dev/null
+++ b/libs/server/Stream/StreamID.cs
@@ -0,0 +1,64 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System;
+using System.Buffers.Binary;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace Garnet.server
+{
+ /// <summary>
+ /// Represents a GarnetStreamID, which is a 128-bit identifier for an entry in a stream.
+ /// </summary>
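+ /// <remarks>
+ /// ms and seq are stored in big-endian byte order so that the raw 16-byte idBytes compare
+ /// correctly as lexicographic keys in the B+tree index.
+ /// </remarks>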
+ [StructLayout(LayoutKind.Explicit)]
+ public unsafe struct StreamID
+ {
+ [FieldOffset(0)]
+ public ulong ms;
+ [FieldOffset(8)]
+ public ulong seq;
+ [FieldOffset(0)]
+ public fixed byte idBytes[16];
+
+ public StreamID(ulong ms, ulong seq)
+ {
+ BinaryPrimitives.WriteUInt64BigEndian(new Span<byte>(Unsafe.AsPointer(ref this.ms), 8), ms);
+ BinaryPrimitives.WriteUInt64BigEndian(new Span<byte>(Unsafe.AsPointer(ref this.seq), 8), seq);
+ }
+ public void setMS(ulong ms)
+ {
+ BinaryPrimitives.WriteUInt64BigEndian(new Span<byte>(Unsafe.AsPointer(ref this.ms), 8), ms);
+ }
+
+ public void setSeq(ulong seq)
+ {
+ BinaryPrimitives.WriteUInt64BigEndian(new Span<byte>(Unsafe.AsPointer(ref this.seq), 8), seq);
+ }
+
+ public ulong getMS()
+ {
+ return BinaryPrimitives.ReadUInt64BigEndian(new ReadOnlySpan<byte>(Unsafe.AsPointer(ref this.ms), 8));
+ }
+
+ public ulong getSeq()
+ {
+ return BinaryPrimitives.ReadUInt64BigEndian(new ReadOnlySpan<byte>(Unsafe.AsPointer(ref this.seq), 8));
+ }
+
+ public unsafe StreamID(byte[] inputBytes)
+ {
+ if (inputBytes.Length != 16)
+ {
+ throw new ArgumentException("idBytes must be 16 bytes");
+ }
+
+ fixed (byte* idBytesPtr = idBytes)
+ {
+ var sourceSpan = new ReadOnlySpan<byte>(inputBytes);
+ var destinationSpan = new Span<byte>(idBytesPtr, 16);
+ sourceSpan.CopyTo(destinationSpan);
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/libs/server/Stream/StreamManager.cs b/libs/server/Stream/StreamManager.cs
new file mode 100644
index 00000000000..c2168e1d8dd
--- /dev/null
+++ b/libs/server/Stream/StreamManager.cs
@@ -0,0 +1,222 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System;
+using System.Collections.Generic;
+using Garnet.common;
+using Tsavorite.core;
+
+namespace Garnet.server
+{
+ public sealed class StreamManager : IDisposable
+ {
+ private Dictionary<byte[], StreamObject> streams;
+ long defPageSize;
+ long defMemorySize;
+ int safeTailRefreshFreqMs;
+
+ SingleWriterMultiReaderLock _lock = new SingleWriterMultiReaderLock();
+
+ public StreamManager(long pageSize, long memorySize, int safeTailRefreshFreqMs)
+ {
+ streams = new Dictionary<byte[], StreamObject>(ByteArrayComparer.Instance);
+ defPageSize = pageSize;
+ defMemorySize = memorySize;
+ this.safeTailRefreshFreqMs = safeTailRefreshFreqMs;
+ }
+
+ /// <summary>
+ /// Add a new entry to the stream
+ /// </summary>
+ /// <param name="keySlice">key/name of the stream</param>
+ /// <param name="idSlice">id of the stream entry</param>
+ /// <param name="noMkStream">if true, do not create a new stream if it does not exist</param>
+ /// <param name="value">payload to the stream</param>
+ /// <param name="valueLength">length of payload to the stream</param>
+ /// <param name="numPairs"># k-v pairs in the payload</param>
+ /// <param name="output">output buffer to write the result to</param>
+ /// <param name="streamKey">key of last stream accessed (for cache)</param>
+ /// <param name="lastStream">reference to last stream accessed (for cache)</param>
+ /// <param name="respProtocolVersion">RESP protocol version</param>
+ public unsafe void StreamAdd(ArgSlice keySlice, ArgSlice idSlice, bool noMkStream, ReadOnlySpan<byte> value, int valueLength, int numPairs, ref SpanByteAndMemory output, out byte[] streamKey, out StreamObject lastStream, byte respProtocolVersion)
+ {
+ // copy the key so it can be stored in the dictionary
+ byte[] key = new byte[keySlice.Length];
+ fixed (byte* keyPtr = key)
+ Buffer.MemoryCopy(keySlice.ptr, keyPtr, keySlice.Length, keySlice.Length);
+ bool foundStream = false;
+ StreamObject stream;
+ lastStream = null;
+ streamKey = null;
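+ // fast path: under the read lock, append to the stream if it already exists; otherwise retry below under the write lock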
+ _lock.ReadLock();
+ try
+ {
+ foundStream = streams.TryGetValue(key, out stream);
+ if (foundStream)
+ {
+ stream.AddEntry(value, valueLength, idSlice, numPairs, ref output, respProtocolVersion);
+ // update last accessed stream key
+ lastStream = stream;
+ streamKey = key;
+ }
+ }
+ finally
+ {
+ _lock.ReadUnlock();
+ }
+ if (foundStream)
+ {
+ return;
+ }
+ // take a write lock
+ _lock.WriteLock();
+ try
+ {
+ // retry to validate if some other thread has created the stream
+ foundStream = streams.TryGetValue(key, out stream);
+ if (!foundStream && !noMkStream)
+ {
+ // stream was not found with this key so create a new one
+ StreamObject newStream = new StreamObject(null, defPageSize, defMemorySize, safeTailRefreshFreqMs);
+ newStream.AddEntry(value, valueLength, idSlice, numPairs, ref output, respProtocolVersion);
+ streams.TryAdd(key, newStream);
+ streamKey = key;
+ lastStream = newStream;
+ }
+ else if (!foundStream && noMkStream)
+ {
+ // stream was not found and noMkStream is set so return an error
+ using var writer = new RespMemoryWriter(respProtocolVersion, ref output);
+ writer.WriteNull();
+ return;
+ }
+ else
+ {
+ stream.AddEntry(value, valueLength, idSlice, numPairs, ref output, respProtocolVersion);
+ lastStream = stream;
+ streamKey = key;
+ }
+ }
+ finally
+ {
+ _lock.WriteUnlock();
+ }
+ return;
+ }
+
+ /// <summary>
+ /// Get the length of a particular stream
+ /// </summary>
+ /// <param name="keySlice">key of the stream whose length we want</param>
+ /// <returns>length of the stream</returns>
+ public unsafe ulong StreamLength(ArgSlice keySlice)
+ {
+ var key = keySlice.ToArray();
+ if (streams != null)
+ {
+ bool foundStream = streams.TryGetValue(key, out StreamObject stream);
+ if (foundStream)
+ {
+ return stream.Length();
+ }
+ else
+ {
+ // return 0 if stream does not exist, as if it was empty
+ return 0;
+ }
+ }
+ return 0;
+ }
+
+ /// <summary>
+ /// Perform range scan in a stream
+ /// </summary>
+ /// <param name="keySlice">key/name of stream</param>
+ /// <param name="start">start of range</param>
+ /// <param name="end">end of range</param>
+ /// <param name="count">maximum number of entries to return (-1 for no limit)</param>
+ /// <param name="output">output buffer to write the result to</param>
+ /// <param name="respProtocolVersion">RESP protocol version</param>
+ public unsafe bool StreamRange(ArgSlice keySlice, string start, string end, int count, ref SpanByteAndMemory output, byte respProtocolVersion)
+ {
+ var key = keySlice.ToArray();
+ if (streams != null && streams.Count > 0)
+ {
+ bool foundStream = streams.TryGetValue(key, out StreamObject stream);
+ if (foundStream)
+ {
+ stream.ReadRange(start, end, count, ref output, respProtocolVersion);
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /// <summary>
+ /// Delete an entry from a stream
+ /// </summary>
+ /// <param name="keySlice">key/name of the stream to delete from</param>
+ /// <param name="idSlice">id of stream entry to delete</param>
+ /// <param name="lastSeenStream">last accessed stream (for cache)</param>
+ /// <returns>true if the entry was deleted</returns>
+ public bool StreamDelete(ArgSlice keySlice, ArgSlice idSlice, out StreamObject lastSeenStream)
+ {
+ bool foundStream;
+ var key = keySlice.ToArray();
+ StreamObject stream;
+ lastSeenStream = null;
+ if (streams != null)
+ {
+ foundStream = streams.TryGetValue(key, out stream);
+
+ if (foundStream)
+ {
+ lastSeenStream = stream;
+ return stream.DeleteEntry(idSlice);
+ }
+ }
+ return false;
+ }
+
+ public bool StreamTrim(ArgSlice keySlice, ArgSlice trimArg, StreamTrimOpts optType, out ulong validKeysRemoved, bool approximate = false)
+ {
+ bool foundStream;
+ var key = keySlice.ToArray();
+ StreamObject stream;
+ validKeysRemoved = 0;
+ if (streams != null)
+ {
+ foundStream = streams.TryGetValue(key, out stream);
+
+ if (foundStream)
+ {
+ return stream.Trim(trimArg, optType, out validKeysRemoved, approximate);
+ }
+ }
+ return true; // no keys removed so return true
+ }
+
+ /// <inheritdoc/>
+ public void Dispose()
+ {
+ if (streams != null)
+ {
+ _lock.WriteLock();
+ try
+ {
+ foreach (var stream in streams.Values)
+ {
+ stream.Dispose();
+ }
+
+ streams.Clear();
+ }
+ finally
+ {
+ _lock.WriteUnlock();
+ }
+ }
+
+ }
+ }
+}
\ No newline at end of file
diff --git a/libs/storage/Tsavorite/cs/src/core/TsavoriteLog/TsavoriteLog.cs b/libs/storage/Tsavorite/cs/src/core/TsavoriteLog/TsavoriteLog.cs
index 9cacbed79a9..0f75d51e82a 100644
--- a/libs/storage/Tsavorite/cs/src/core/TsavoriteLog/TsavoriteLog.cs
+++ b/libs/storage/Tsavorite/cs/src/core/TsavoriteLog/TsavoriteLog.cs
@@ -844,6 +844,44 @@ public unsafe bool TryEnqueue(ReadOnlySpan<byte> entry, out long logicalAddress)
return true;
}
+ public unsafe bool TryEnqueueStreamEntry(byte* id, int idLength, int numPairs, ReadOnlySpan<byte> entry, int entryLength, out long logicalAddress)
+ {
+ logicalAddress = 0;
+ var length = idLength + sizeof(int) + entryLength;
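+ // record layout: [id (idLength bytes)][numPairs (4-byte int)][entry payload (entryLength bytes)];
+ // note the id copy below assumes a 16-byte id (copied as two longs)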
+ int allocatedLength = headerSize + Align(length);
+ ValidateAllocatedLength(allocatedLength);
+
+ epoch.Resume();
+
+ if (commitNum == long.MaxValue) throw new TsavoriteException("Attempting to enqueue into a completed log");
+
+ logicalAddress = allocator.TryAllocateRetryNow(allocatedLength);
+ if (logicalAddress == 0)
+ {
+ epoch.Suspend();
+ if (cannedException != null) throw cannedException;
+ return false;
+ }
+
+ var physicalAddress = allocator.GetPhysicalAddress(logicalAddress);
+ // start writing
+ // copy the id
+ *(long*)(headerSize + physicalAddress) = *(long*)id;
+ *(long*)(headerSize + physicalAddress + 8) = *(long*)(id + sizeof(long));
+ // copy the number of pairs
+ *(int*)(headerSize + physicalAddress + idLength) = numPairs;
+ // copy the entry
+ fixed (byte* bp = &entry.GetPinnableReference())
+ Buffer.MemoryCopy(bp, (void*)(headerSize + physicalAddress + idLength + sizeof(int)), entryLength, entryLength);
+
+ SetHeader(length, (byte*)physicalAddress);
+ safeTailRefreshEntryEnqueued?.Signal();
+ epoch.Suspend();
+ if (AutoCommit) Commit();
+ return true;
+ }
+
+
///
/// Append a user-defined blittable struct header atomically to the log.
///
diff --git a/playground/BTree/Btree.csproj b/playground/BTree/Btree.csproj
new file mode 100644
index 00000000000..b4678381019
--- /dev/null
+++ b/playground/BTree/Btree.csproj
@@ -0,0 +1,18 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net8.0</TargetFramework>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+  </PropertyGroup>
+
+
+
+
+
+
+
+
+
+
+</Project>
diff --git a/playground/BTree/Program.cs b/playground/BTree/Program.cs
new file mode 100644
index 00000000000..6fbb87d6d24
--- /dev/null
+++ b/playground/BTree/Program.cs
@@ -0,0 +1,164 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using Garnet.server;
+using Garnet.server.BTreeIndex;
+class Program
+{
+ /// <summary>
+ /// Playground for the B+tree index implementation
+ /// </summary>
+ /// <param name="args"></param>
+ static unsafe void Main(string[] args)
+ {
+ var tree = new BTree((uint)BTreeNode.PAGE_SIZE);
+ ulong N = 50000;
+ bool verbose = true;
+ if (args.Length > 0)
+ {
+ for (int i = 0; i < args.Length; i++)
+ {
+ if (args[i] == "--verb")
+ {
+ verbose = true;
+ }
+ else if (args[i] == "-N")
+ {
+ N = ulong.Parse(args[i + 1]);
+ break;
+ }
+ }
+ }
+ StreamID[] streamIDs = new StreamID[N];
+ long duration = 0;
+ long dur2 = 0;
+ for (ulong i = 0; i < N; i++)
+ {
+ StreamID x = new StreamID(i + 1, 0);
+ Debug.Assert(x.ms > 0);
+ streamIDs[i] = x;
+ }
+ long start = Stopwatch.GetTimestamp();
+ Stopwatch sw = new Stopwatch();
+ sw.Start();
+ for (ulong i = 0; i < N; i++)
+ {
+ tree.Insert((byte*)Unsafe.AsPointer(ref streamIDs[i].idBytes[0]), new Value(i + 1));
+ var value = tree.Get((byte*)Unsafe.AsPointer(ref streamIDs[i].idBytes[0]));
+ Debug.Assert(value.address == i + 1);
+ }
+ sw.Stop();
+ dur2 = sw.ElapsedTicks;
+ duration += Stopwatch.GetTimestamp() - start;
+ Console.WriteLine(" Number of Fast Inserts = " + tree.FastInserts);
+ double nanosecondsPerTick = (1_000_000_000.0) / Stopwatch.Frequency;
+ if (verbose)
+ {
+ Console.WriteLine("Insertion done");
+ Console.WriteLine(" Number of Fast Inserts = " + tree.FastInserts);
+ Console.WriteLine("Number of Leaves = " + tree.LeafCount);
+ Console.WriteLine("Number of Internal Nodes = " + tree.InternalCount);
+ Console.WriteLine("Time for insertion = " + (double)dur2 * nanosecondsPerTick + " ns");
+ }
+ long insertion_time = (long)(dur2 * nanosecondsPerTick);
+ sw.Reset();
+
+ // point lookups
+ sw.Start();
+ for (ulong i = 0; i < N; i++)
+ {
+ var value = tree.Get((byte*)Unsafe.AsPointer(ref streamIDs[i].idBytes[0]));
+ Debug.Assert(value.address == i + 1);
+ }
+ sw.Stop();
+ long query_time = (long)(sw.ElapsedTicks * nanosecondsPerTick);
+ if (verbose)
+ {
+ Console.WriteLine("Time for querying = " + query_time + " ns");
+ }
+ sw.Reset();
+ Console.WriteLine("All inserted keys found");
+
+ // forward range query
+ double[] selectivities = [0.01, 0.05, 0.1];
+ long[] range_query_times = new long[selectivities.Length];
+ Value[] startVal = new Value[selectivities.Length];
+ Value[] endVal = new Value[selectivities.Length];
+ List<Value>[] list = new List<Value>[selectivities.Length];
+ for (int i = 0; i < selectivities.Length; i++)
+ {
+ double selectivity = selectivities[i];
+ ulong startIdx, endIdx;
+ do
+ {
+ // get a random start index from 0 to N
+ startIdx = (ulong)new Random().Next(0, (int)N);
+ endIdx = (ulong)(startIdx + (N * selectivity));
+ } while (endIdx >= N);
+ sw.Start();
+ var count = tree.Get((byte*)Unsafe.AsPointer(ref streamIDs[startIdx].idBytes[0]), (byte*)Unsafe.AsPointer(ref streamIDs[endIdx].idBytes[0]), out startVal[i], out endVal[i], out list[i]);
+ Debug.Assert(count == (int)(endIdx - startIdx + 1));
+ sw.Stop();
+ range_query_times[i] = (long)(sw.ElapsedTicks * nanosecondsPerTick);
+ if (verbose)
+ {
+ Console.WriteLine("Time for range query " + (i + 1) + " = " + range_query_times[i] + " ns");
+ }
+ sw.Reset();
+ }
+ if (verbose)
+ Console.WriteLine("Range query check passed ");
+
+ // tree.TrimByID((byte*)Unsafe.AsPointer(ref streamIDs[500].idBytes[0]), out var validKeysRemoved, out var headValue, out var headValidKey, out var numLeavesDeleted);
+ // Console.WriteLine("Trimmed by ID: validKeysRemoved = " + validKeysRemoved);
+ // Console.WriteLine("num leaves deleted = " + numLeavesDeleted);
+
+ // tree.TrimByLength(2000, out var validKeysRemoved2, out var headValue2, out var headValidKey2, out var numLeavesDeleted2);
+ // Console.WriteLine("Trimmed by length: validKeysRemoved = " + validKeysRemoved2);
+ // Console.WriteLine("num leaves deleted = " + numLeavesDeleted2);
+
+ // now let's delete some keys
+ sw.Reset();
+ int num_deletes = 100;
+ int num_successfully_deleted = 0;
+ for (int i = 0; i < num_deletes; i++)
+ {
+ // generate a random index to delete
+ int idx = new Random().Next(0, (int)N);
+ sw.Start();
+ bool val = false;
+ // bool val = tree.Delete((byte*)Unsafe.AsPointer(ref streamIDs[idx].idBytes[0]));
+ sw.Stop();
+ if (val)
+ {
+ num_successfully_deleted++;
+ }
+ }
+ long deleteTime = (long)(sw.ElapsedTicks * nanosecondsPerTick);
+ if (verbose)
+ {
+ Console.WriteLine("Number of keys deleted = " + num_successfully_deleted);
+ Console.WriteLine("Time for deletion = " + deleteTime + " ns");
+ }
+
+ tree.Delete((byte*)Unsafe.AsPointer(ref streamIDs[N - 400].idBytes[0]));
+ tree.Delete((byte*)Unsafe.AsPointer(ref streamIDs[N - 300].idBytes[0]));
+ tree.Delete((byte*)Unsafe.AsPointer(ref streamIDs[N - 200].idBytes[0]));
+ tree.Delete((byte*)Unsafe.AsPointer(ref streamIDs[N - 100].idBytes[0]));
+
+ // do a range query to check again
+ tree.Get((byte*)Unsafe.AsPointer(ref streamIDs[N - 500].idBytes[0]), (byte*)Unsafe.AsPointer(ref streamIDs[N - 1].idBytes[0]), out Value startVal1, out Value endVal1, out List<Value> tombstones);
+ Debug.Assert(tombstones.Count == 4);
+ Console.WriteLine("Delete check passed ");
+
+ // print all times collected in a csv format
+ Console.WriteLine(insertion_time + ", " + query_time + ", " + range_query_times[0] + ", " + range_query_times[1] + ", " + range_query_times[2] + ", " + deleteTime);
+ tree.Deallocate();
+ Console.WriteLine("Num allocates = " + tree.stats.numAllocates);
+ Console.WriteLine("Num deallocates = " + tree.stats.numDeallocates);
+ Console.WriteLine("All checks passed");
+ }
+}
\ No newline at end of file
diff --git a/playground/CommandInfoUpdater/SupportedCommand.cs b/playground/CommandInfoUpdater/SupportedCommand.cs
index 95edfe7a930..72531e067e9 100644
--- a/playground/CommandInfoUpdater/SupportedCommand.cs
+++ b/playground/CommandInfoUpdater/SupportedCommand.cs
@@ -311,6 +311,11 @@ public class SupportedCommand
new("WATCH", RespCommand.WATCH),
new("WATCHMS", RespCommand.WATCHMS),
new("WATCHOS", RespCommand.WATCHOS),
+ new("XADD", RespCommand.XADD),
+ new("XDEL", RespCommand.XDEL),
+ new("XLEN", RespCommand.XLEN),
+ new("XRANGE", RespCommand.XRANGE),
+ new("XTRIM", RespCommand.XTRIM),
new("ZADD", RespCommand.ZADD),
new("ZCARD", RespCommand.ZCARD),
new("ZCOUNT", RespCommand.ZCOUNT),
diff --git a/test/Garnet.test/BTreeTests.cs b/test/Garnet.test/BTreeTests.cs
new file mode 100644
index 00000000000..8854d30fe15
--- /dev/null
+++ b/test/Garnet.test/BTreeTests.cs
@@ -0,0 +1,171 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System;
+using System.Collections.Generic;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using Garnet.server;
+using Garnet.server.BTreeIndex;
+using NUnit.Framework;
+using NUnit.Framework.Legacy;
+
+namespace Garnet.test
+{
+ using Value = Value;
+
+ [TestFixture]
+ public unsafe class BTreeTests
+ {
+ static StreamID[] streamIDs;
+ static ulong N = 50000;
+
+ [SetUp]
+ public void Setup()
+ {
+ streamIDs = new StreamID[N];
+ for (ulong i = 0; i < N; i++)
+ {
+ streamIDs[i] = new StreamID(i + 1, 0);
+ }
+ }
+
+ [TearDown]
+ public void TearDown()
+ { }
+
+ [Test]
+ [Category("INIT")]
+ public void InitBTreeLeafNode()
+ {
+ // var memoryBlock = (IntPtr*)Marshal.AllocHGlobal(BTreeNode.PAGE_SIZE).ToPointer();
+ var memoryBlock = (IntPtr*)NativeMemory.AlignedAlloc((nuint)BTreeNode.PAGE_SIZE, (nuint)BTreeNode.PAGE_SIZE);
+ var leaf = BTreeNode.Create(BTreeNodeType.Leaf, memoryBlock);
+ ClassicAssert.AreEqual(leaf->info->type, BTreeNodeType.Leaf);
+ ClassicAssert.AreEqual(leaf->info->count, 0);
+
+ // free the leaf
+ BTree.FreeNode(ref leaf);
+
+ leaf = null;
+ }
+
+ [Test]
+ [Category("INSERT")]
+ public void Insert()
+ {
+ var tree = new BTree((uint)BTreeNode.PAGE_SIZE);
+ ClassicAssert.AreEqual(tree.FastInserts, 0);
+ ClassicAssert.AreEqual(tree.LeafCount, 1);
+ ClassicAssert.AreEqual(tree.InternalCount, 0);
+
+ for (ulong i = 0; i < N; i++)
+ {
+ tree.Insert((byte*)Unsafe.AsPointer(ref streamIDs[i].idBytes[0]), new Value(i + 1));
+ }
+ ClassicAssert.AreEqual(tree.FastInserts, N);
+ tree.Deallocate();
+ }
+
+ [Test]
+ [Category("LOOKUP")]
+ public void PointLookup()
+ {
+ var tree = new BTree((uint)BTreeNode.PAGE_SIZE);
+
+ for (ulong i = 0; i < N; i++)
+ {
+ tree.Insert((byte*)Unsafe.AsPointer(ref streamIDs[i].idBytes[0]), new Value(streamIDs[i].ms));
+ }
+
+ for (ulong i = 0; i < N; i++)
+ {
+ ClassicAssert.AreEqual(tree.Get((byte*)Unsafe.AsPointer(ref streamIDs[i].idBytes[0])).address, streamIDs[i].ms);
+ }
+
+ tree.Deallocate();
+ }
+
+ [Test]
+ [Category("LOOKUP")]
+ public void RangeLookup()
+ {
+ var tree = new BTree(4096);
+
+ for (ulong i = 0; i < N; i++)
+ {
+ tree.Insert((byte*)Unsafe.AsPointer(ref streamIDs[i].idBytes[0]), new Value(streamIDs[i].ms));
+ }
+
+ int count = tree.Get((byte*)Unsafe.AsPointer(ref streamIDs[N - 200].idBytes[0]), (byte*)Unsafe.AsPointer(ref streamIDs[N - 1].idBytes[0]), out Value startVal, out Value endVal, out List<Value> list);
+ ClassicAssert.AreEqual(count, N - 1 - (N - 200) + 1);
+ ClassicAssert.AreEqual(list.Count, 0);
+ ClassicAssert.AreEqual(startVal.address, streamIDs[N - 200].ms);
+ ClassicAssert.AreEqual(endVal.address, streamIDs[N - 1].ms);
+
+ tree.Deallocate();
+ }
+
+ [Test]
+ [Category("Delete")]
+ public void Delete()
+ {
+ var tree = new BTree((uint)BTreeNode.PAGE_SIZE);
+ for (ulong i = 0; i < N; i++)
+ {
+ tree.Insert((byte*)Unsafe.AsPointer(ref streamIDs[i].idBytes[0]), new Value(streamIDs[i].ms));
+ }
+
+ // delete 10% of keys at random
+ Random rand = new Random();
+ uint delCount = 0;
+ for (ulong i = 0; i < N / 10; i++)
+ {
+ ulong idx = (ulong)rand.Next(0, (int)N);
+ bool deleted = tree.Delete((byte*)Unsafe.AsPointer(ref streamIDs[idx].idBytes[0]));
+ if (deleted)
+ {
+ delCount++;
+ }
+ }
+ ClassicAssert.AreEqual(tree.ValidCount, N - delCount);
+ tree.Deallocate();
+ }
+
+ [Test]
+ [Category("Trim")]
+ public void TrimByLength()
+ {
+ var tree = new BTree((uint)BTreeNode.PAGE_SIZE);
+ for (ulong i = 0; i < N; i++)
+ {
+ tree.Insert((byte*)Unsafe.AsPointer(ref streamIDs[i].idBytes[0]), new Value(streamIDs[i].ms));
+ }
+
+            var trimLength = 5000; // trim the tree down to 5000 entries (a tenth of its original size)
+ tree.TrimByLength((ulong)trimLength, out var validKeysRemoved, out var headValue, out var headValidKey, out var numLeavesDeleted);
+ var validKeysRemaining = tree.RootValidCount + tree.TailValidCount;
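+            // trimming is expected to retain at least trimLength valid keys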
+ ClassicAssert.GreaterOrEqual(validKeysRemaining, trimLength);
+
+ tree.Deallocate();
+ }
+
+ [Test]
+ [Category("TrimByID")]
+ public void TrimByID()
+ {
+ var tree = new BTree((uint)BTreeNode.PAGE_SIZE);
+ for (ulong i = 0; i < N; i++)
+ {
+ tree.Insert((byte*)Unsafe.AsPointer(ref streamIDs[i].idBytes[0]), new Value(streamIDs[i].ms));
+ }
+
+ var streamIDToTrim = streamIDs[N - 1000];
+ tree.TrimByID((byte*)Unsafe.AsPointer(ref streamIDToTrim.idBytes[0]), out var validKeysRemoved, out var headValue, out var headValidKey, out var numLeavesDeleted);
+ var validKeysRemaining = tree.RootValidCount + tree.TailValidCount;
+ ClassicAssert.GreaterOrEqual((ulong)validKeysRemaining, N - validKeysRemoved);
+
+ tree.Deallocate();
+ }
+ }
+}
diff --git a/test/Garnet.test/Resp/ACL/RespCommandTests.cs b/test/Garnet.test/Resp/ACL/RespCommandTests.cs
index 243d92fe97e..925707d29b5 100644
--- a/test/Garnet.test/Resp/ACL/RespCommandTests.cs
+++ b/test/Garnet.test/Resp/ACL/RespCommandTests.cs
@@ -33,7 +33,7 @@ public void Setup()
{
TestUtils.DeleteDirectory(TestUtils.MethodTestDir, wait: true);
server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir, defaultPassword: DefaultPassword,
- useAcl: true, enableLua: true, enableModuleCommand: true);
+ useAcl: true, enableLua: true, enableModuleCommand: true, enableStreams: true);
// Register custom commands so we can test ACL'ing them
ClassicAssert.IsTrue(TestUtils.TryGetCustomCommandsInfo(out respCustomCommandsInfo));
@@ -6436,6 +6436,87 @@ static async Task DoGeoSearchStoreAsync(GarnetClient client)
}
}
+ [Test]
+ public async Task XADDACLsAsync()
+ {
+ int count = 0;
+ await CheckCommandsAsync(
+ "XADD",
+ [DoXAddAsync]
+ );
+
+ async Task DoXAddAsync(GarnetClient client)
+ {
+        string val = await client.ExecuteForStringResultAsync("XADD", ["foo", "*", $"bar--{count}", "fizz"]);
+        count++;
+        ClassicAssert.IsNotNull(val);
+ }
+ }
+
+ [Test]
+ public async Task XLENACLsAsync()
+ {
+ await CheckCommandsAsync(
+ "XLEN",
+ [DoXLenAsync]
+ );
+
+ async Task DoXLenAsync(GarnetClient client)
+ {
+ long val = await client.ExecuteForLongResultAsync("XLEN", ["foo"]);
+ ClassicAssert.AreEqual(0, val);
+ }
+ }
+
+ [Test]
+ public async Task XRangeACLsAsync()
+ {
+ await CheckCommandsAsync(
+ "XRANGE",
+ [DoXRangeAsync]
+ );
+
+ async Task DoXRangeAsync(GarnetClient client)
+ {
+ var val = await client.ExecuteForStringArrayResultAsync("XRANGE", ["foo", "-", "+"]);
+ ClassicAssert.AreEqual(0, val.Length);
+ }
+ }
+
+ [Test]
+ public async Task XDELACLsAsync()
+ {
+ await CheckCommandsAsync(
+ "XDEL",
+ [DoXDelAsync]
+ );
+
+ async Task DoXDelAsync(GarnetClient client)
+ {
+ long val = await client.ExecuteForLongResultAsync("XDEL", ["foo", "1"]);
+ ClassicAssert.AreEqual(0, val);
+ }
+ }
+
+ [Test]
+ public async Task XTRIMACLsAsync()
+ {
+ await CheckCommandsAsync(
+ "XTRIM",
+ [DoXTrimMinIDAsync, DoXTrimMaxLenAsync]
+        );
+
+ async Task DoXTrimMinIDAsync(GarnetClient client)
+ {
+ long val = await client.ExecuteForLongResultAsync("XTRIM", ["foo", "MINID", "0-0"]);
+ ClassicAssert.AreEqual(0, val);
+ }
+
+ async Task DoXTrimMaxLenAsync(GarnetClient client)
+ {
+ long val = await client.ExecuteForLongResultAsync("XTRIM", ["foo", "MAXLEN", "0"]);
+ ClassicAssert.AreEqual(0, val);
+ }
+ }
+
[Test]
public async Task ZAddACLsAsync()
{
diff --git a/test/Garnet.test/RespStreamTests.cs b/test/Garnet.test/RespStreamTests.cs
new file mode 100644
index 00000000000..51e24663fd0
--- /dev/null
+++ b/test/Garnet.test/RespStreamTests.cs
@@ -0,0 +1,224 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using NUnit.Framework;
+using NUnit.Framework.Legacy;
+using StackExchange.Redis;
+
+namespace Garnet.test
+{
+ [TestFixture]
+ public class RespStreamTests
+ {
+ protected GarnetServer server;
+ const string chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+ Random random;
+ static ulong N = 5;
+
+ [SetUp]
+ public void Setup()
+ {
+ TestUtils.DeleteDirectory(TestUtils.MethodTestDir, wait: true);
+ server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir, lowMemory: true, enableStreams: true);
+ server.Start();
+ random = new Random();
+
+ // write to one stream to test for range scans
+ var streamKey = "rangeScan";
+ using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig());
+ var db = redis.GetDatabase(0);
+ for (ulong i = 0; i < N; i++)
+ {
+ var entryKey = GenerateRandomString(4); // generate random ascii string of length 4
+ var entryValue = GenerateRandomString(4); // generate random ascii string of length 4
+ var retId = db.StreamAdd(streamKey, entryKey, entryValue);
+ }
+ }
+
+ [TearDown]
+ public void TearDown()
+ {
+ server.Dispose();
+ TestUtils.DeleteDirectory(TestUtils.MethodTestDir);
+ }
+
+ public string GenerateRandomString(int length)
+ {
+ return new string(Enumerable.Repeat(chars, length)
+ .Select(s => s[random.Next(s.Length)]).ToArray());
+ }
+
+ #region STREAMIDTests
+ [Test]
+ public void StreamAddAutoGenIdTest()
+ {
+ using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig());
+ var db = redis.GetDatabase(0);
+
+ var streamKey = "add";
+ var entryKey = GenerateRandomString(4); // generate random ascii string of length 4
+ var entryValue = GenerateRandomString(4); // generate random ascii string of length 4
+ var retId = db.StreamAdd(streamKey, entryKey, entryValue);
+ ClassicAssert.IsTrue(retId.ToString().Contains("-"));
+ }
+
+ [Test]
+ public void StreamAddUserDefinedTsTest()
+ {
+ using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig());
+ var db = redis.GetDatabase(0);
+
+ var streamKey = "addTs";
+ var entryKey = GenerateRandomString(4); // generate random ascii string of length 4
+ var entryValue = GenerateRandomString(4); // generate random ascii string of length 4
+ var retId = db.StreamAdd(streamKey, entryKey, entryValue, $"{1}");
+ ClassicAssert.IsTrue(retId.ToString().Contains("-"));
+ }
+
+ [Test]
+ public void StreamAddUserDefinedIdTest()
+ {
+ using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig());
+ var db = redis.GetDatabase(0);
+
+ var streamKey = "addId";
+ var entryKey = GenerateRandomString(4); // generate random ascii string of length 4
+ var entryValue = GenerateRandomString(4); // generate random ascii string of length 4
+ var retId = db.StreamAdd(streamKey, entryKey, entryValue, $"{1}-0");
+ ClassicAssert.IsTrue(retId.ToString().Contains("-"));
+ }
+ #endregion
+
+ #region STREAMOperationsTests
+ [Test]
+ public void StreamAddAndLengthTest()
+ {
+ using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig());
+ var db = redis.GetDatabase(0);
+
+ var streamKey = "length";
+ var count = 0;
+ for (ulong i = 0; i < N; i++)
+ {
+ var entryKey = GenerateRandomString(4); // generate random ascii string of length 4
+ var entryValue = GenerateRandomString(4); // generate random ascii string of length 4
+ var retId = db.StreamAdd(streamKey, entryKey, entryValue);
+ count++;
+ }
+ ClassicAssert.AreEqual(count, N);
+
+ var length = db.StreamLength(streamKey);
+ ClassicAssert.AreEqual(length, N);
+ }
+
+ [Test]
+ public void StreamRangeExistingTest()
+ {
+ using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig());
+ var db = redis.GetDatabase(0);
+ var streamKey = "rangeScan";
+ var range = db.StreamRange(streamKey, "-", "+");
+ ClassicAssert.AreEqual(range.Length, N);
+ }
+
+ [Test]
+ public void StreamRangeNonExistingTest()
+ {
+ using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig());
+ var db = redis.GetDatabase(0);
+ var streamKey = "nonExistingRangeScan";
+ var range = db.StreamRange(streamKey, "-", "+");
+ ClassicAssert.AreEqual(range.Length, 0);
+ }
+
+ [Test]
+ public void StreamRangeWithCountTest()
+ {
+ using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig());
+ var db = redis.GetDatabase(0);
+ var streamKey = "rangeScan";
+ int limit = 2;
+ var range = db.StreamRange(streamKey, "-", "+", limit);
+ ClassicAssert.AreEqual(range.Length, limit);
+ }
+
+ [Test]
+ public void StreamDeleteSingleTest()
+ {
+ using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig());
+ var db = redis.GetDatabase(0);
+
+ var streamKey = "delOne";
+ var entryKey = GenerateRandomString(4); // generate random ascii string of length 4
+ var entryValue = GenerateRandomString(4); // generate random ascii string of length 4
+ var retId = db.StreamAdd(streamKey, entryKey, entryValue, $"{1}-0");
+
+ var delCount = db.StreamDelete(streamKey, [retId]);
+ ClassicAssert.AreEqual(delCount, 1);
+ }
+
+ [Test]
+ [Category("Delete")]
+ public void StreamDeleteMultipleTest()
+ {
+ using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig());
+ var db = redis.GetDatabase(0);
+
+ var streamKey = "delMultiple";
+ var count = 0;
+ for (ulong i = 0; i < N; i++)
+ {
+ var entryKey = GenerateRandomString(4); // generate random ascii string of length 4
+ var entryValue = GenerateRandomString(4); // generate random ascii string of length 4
+ var retId = db.StreamAdd(streamKey, entryKey, entryValue, $"{i + 1}-0");
+ count++;
+ }
+ ClassicAssert.AreEqual(count, N);
+
+ // Pick arbitrary 2 unique indices between 0 and N and store each index in a set
+ int numToDelete = 2;
+            var indices = new HashSet<int>();
+ while (indices.Count < numToDelete)
+ {
+ indices.Add(random.Next(0, (int)N));
+ }
+
+ var eIds = new RedisValue[numToDelete];
+ int c = 0;
+ foreach (var idx in indices)
+ {
+ eIds[c++] = $"{idx + 1}-0";
+ }
+
+ var delCount = db.StreamDelete(streamKey, eIds);
+ ClassicAssert.AreEqual(delCount, indices.Count);
+ }
+
+ [Test]
+ [Category("Trim")]
+ public void StreamTrimMaxLenTest()
+ {
+ using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig());
+ var db = redis.GetDatabase(0);
+
+ var streamKey = "trimByMaxLen";
+ long count = 500;
+ for (long i = 0; i < count; i++)
+ {
+ var entryKey = GenerateRandomString(4); // generate random ascii string of length 4
+ var entryValue = GenerateRandomString(4); // generate random ascii string of length 4
+ var retId = db.StreamAdd(streamKey, entryKey, entryValue, $"{i + 1}-0");
+ }
+ var maxLen = 100;
+ var trimCount = db.StreamTrim(streamKey, maxLen);
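+            // at least maxLen entries should remain after the trim; trimCount is the number evicted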
+ ClassicAssert.GreaterOrEqual(trimCount, 1);
+ ClassicAssert.GreaterOrEqual(count - trimCount, maxLen);
+ }
+
+ #endregion
+ }
+}
\ No newline at end of file
diff --git a/test/Garnet.test/TestUtils.cs b/test/Garnet.test/TestUtils.cs
index 25a8fb0bf8f..173ab967420 100644
--- a/test/Garnet.test/TestUtils.cs
+++ b/test/Garnet.test/TestUtils.cs
@@ -266,6 +266,7 @@ public static GarnetServer CreateGarnetServer(
int slowLogThreshold = 0,
TextWriter logTo = null,
bool enableCluster = false,
+ bool enableStreams = false,
int expiredKeyDeletionScanFrequencySecs = -1,
bool useReviv = false
)
@@ -355,6 +356,7 @@ public static GarnetServer CreateGarnetServer(
UnixSocketPermission = unixSocketPermission,
SlowLogThreshold = slowLogThreshold,
ExpiredKeyDeletionScanFrequencySecs = expiredKeyDeletionScanFrequencySecs,
+ EnableStreams = enableStreams,
};
if (!string.IsNullOrEmpty(pubSubPageSize))
diff --git a/website/docs/commands/api-compatibility.md b/website/docs/commands/api-compatibility.md
index f6dce718fb3..329198489a3 100644
--- a/website/docs/commands/api-compatibility.md
+++ b/website/docs/commands/api-compatibility.md
@@ -354,10 +354,10 @@ Note that this list is subject to change as we continue to expand our API comman
| | [ZUNION](data-structures.md#zunion) | ➕ | |
| | [ZUNIONSTORE](data-structures.md#zunionstore) | ➕ | |
| **STREAM** | XACK | ➖ | |
-| | XADD | ➖ | |
+| | XADD | ➕ | Does not support capped streams |
| | XAUTOCLAIM | ➖ | |
| | XCLAIM | ➖ | |
-| | XDEL | ➖ | |
+| | XDEL | ➕ | |
| | XGROUP CREATE | ➖ | |
| | XGROUP CREATECONSUMER | ➖ | |
| | XGROUP DELCONSUMER | ➖ | |
@@ -368,14 +368,14 @@ Note that this list is subject to change as we continue to expand our API comman
| | XINFO GROUPS | ➖ | |
| | XINFO HELP | ➖ | |
| | XINFO STREAM | ➖ | |
-| | XLEN | ➖ | |
+| | XLEN | ➕ | |
| | XPENDING | ➖ | |
-| | XRANGE | ➖ | |
+| | XRANGE | ➕ | |
| | XREAD | ➖ | |
| | XREADGROUP | ➖ | |
| | XREVRANGE | ➖ | |
| | XSETID | ➖ | |
-| | XTRIM | ➖ | |
+| | XTRIM | ➕ | Does not support near-exact trimming |
| **STRING** | [APPEND](raw-string.md#append) | ➕ | |
| | [DECR](raw-string.md#decr) | ➕ | |
| | [DECRBY](raw-string.md#decrby) | ➕ | |
diff --git a/website/docs/commands/data-structures.md b/website/docs/commands/data-structures.md
index deb459d6ed4..5ce028ee55c 100644
--- a/website/docs/commands/data-structures.md
+++ b/website/docs/commands/data-structures.md
@@ -980,6 +980,80 @@ If **destination** already exists, it is overwritten.
---
+## Stream
+
+### XADD
+
+#### Syntax
+
+```bash
+ XADD key [NOMKSTREAM] <* | id> field value [field value ...]
+```
+Appends the given stream entry to the stream at the specified key. If the key does not exist, it is created when the command runs.
+Creation of the stream can be disabled with the `NOMKSTREAM` option.
+
+Every entry in the stream is identified by a stream entry ID and consists of field-value pairs that are stored and read in the order provided by the user.
+While [XADD](#xadd) can auto-generate a unique ID using the `*` character, a user-defined ID can also be given as two 64-bit numbers separated by a `-` character.
+IDs are guaranteed to be monotonically increasing.
+
+**Capped Streams** are not currently supported.
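+
+A minimal sketch of adding one entry with an auto-generated ID (the key name and returned ID are illustrative):
+
+```bash
+> XADD mystream * temperature 97
+"1700000000000-0"
+```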
+
+---
+
+### XLEN
+
+#### Syntax
+
+```bash
+ XLEN key
+```
+Returns the number of entries in the stream stored at `key`; if the stream does not exist, 0 is returned.
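+
+For example, after the single `XADD` shown above (`mystream` is the illustrative key used in that example):
+
+```bash
+> XLEN mystream
+(integer) 1
+```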
+
+---
+
+### XRANGE
+
+#### Syntax
+
+```bash
+ XRANGE key start end [COUNT count]
+```
+Returns the stream entries matching a given range of IDs.
+`start` and `end` can be the special IDs `-` and `+`, denoting the minimum and maximum possible IDs in the stream, respectively.
+The IDs provided may also be incomplete (i.e., specifying only the first part of the ID).
+The `COUNT` option limits the number of entries returned.
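+
+For example, scanning the whole stream with a cap on the result size (IDs and values are illustrative):
+
+```bash
+> XRANGE mystream - + COUNT 10
+1) 1) "1700000000000-0"
+   2) 1) "temperature"
+      2) "97"
+```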
+
+---
+
+### XDEL
+
+#### Syntax
+
+```bash
+ XDEL key id [id ...]
+```
+Removes the specified entries from the stream stored at `key` and returns the number of entries deleted.
+IDs that do not exist in the stream are not counted as deleted, so the returned count may be less than the number of IDs provided.
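+
+For example, passing one existing and one non-existing ID (both IDs are illustrative) returns a count of 1:
+
+```bash
+> XDEL mystream 1700000000000-0 9999999999999-0
+(integer) 1
+```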
+
+---
+
+### XTRIM
+
+#### Syntax
+
+```bash
+    XTRIM key <MAXLEN | MINID> threshold
+```
+Trims the stream by evicting older entries using one of two strategies:
+
+- `MAXLEN`: evicts entries while the stream's length exceeds the specified `threshold`.
+- `MINID`: evicts entries with IDs lower than `threshold`, where `threshold` is an entry ID.
+
+The `LIMIT` clause is not currently supported.
+`MINID` defaults to exact trimming, meaning all entries with IDs lower than `threshold` are deleted.
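+
+For example, keeping only the 100 most recent entries, then evicting everything below a given ID (counts and IDs are illustrative):
+
+```bash
+> XTRIM mystream MAXLEN 100
+(integer) 400
+> XTRIM mystream MINID 1700000000000-0
+(integer) 2
+```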
+
+---
+
## Sorted Set
### ZADD