Skip to content

[Issue 1191] Handle free list for revivification in RMW for expiration/deletes #1189

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 22 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 10 additions & 12 deletions libs/server/Storage/Functions/MainStore/RMWMethods.cs
Original file line number Diff line number Diff line change
Expand Up @@ -911,14 +911,19 @@ public bool NeedCopyUpdate(ref SpanByte key, ref RawStringInput input, ref SpanB
case RespCommand.DELIFGREATER:
if (rmwInfo.RecordInfo.ETag)
EtagState.SetValsForRecordWithEtag(ref functionsState.etagState, ref oldValue);

long etagFromClient = input.parseState.GetLong(0);
if (etagFromClient <= functionsState.etagState.etag)
if (etagFromClient > functionsState.etagState.etag)
{
EtagState.ResetState(ref functionsState.etagState);
return false;
rmwInfo.Action = RMWAction.ExpireAndStop;
}

return true;
EtagState.ResetState(ref functionsState.etagState);
// We always return false because we would rather not create a new record in hybrid log if we don't need to delete the object.
// Setting no Action and returning false for non-delete case will shortcircuit the InternalRMW code to not run CU, and return SUCCESS.
// If we want to delete the object setting the Action to ExpireAndStop will add the tombstone in hybrid log for us.
return false;

case RespCommand.SETIFGREATER:
case RespCommand.SETIFMATCH:
if (rmwInfo.RecordInfo.ETag)
Expand Down Expand Up @@ -1046,13 +1051,6 @@ public bool CopyUpdater(ref SpanByte key, ref RawStringInput input, ref SpanByte

switch (cmd)
{
case RespCommand.DELIFGREATER:
// NCU has already checked for making sure the etag is greater than the existing etag by this point
long etagFromClient = input.parseState.GetLong(0);
rmwInfo.Action = RMWAction.ExpireAndStop;
EtagState.ResetState(ref functionsState.etagState);
return false;

case RespCommand.SETIFGREATER:
case RespCommand.SETIFMATCH:
// By now the comparison for etag against existing etag has already been done in NeedCopyUpdate
Expand All @@ -1074,7 +1072,7 @@ public bool CopyUpdater(ref SpanByte key, ref RawStringInput input, ref SpanByte
Span<byte> dest = newValue.AsSpan(EtagConstants.EtagSize);
src.CopyTo(dest);

etagFromClient = input.parseState.GetLong(1);
long etagFromClient = input.parseState.GetLong(1);

functionsState.etagState.etag = etagFromClient;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ public readonly (int actualSize, int allocatedSize, int keySize) GetRMWCopyDesti
where TVariableLengthInput : IVariableLengthInput<TValue, TInput>
=> BlittableAllocatorImpl<TKey, TValue, TStoreFunctions>.GetRMWCopyDestinationRecordSize(ref key, ref input, ref value, ref recordInfo, varlenInput);

/// <inheritdoc/>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public (int actualSize, int allocatedSize, int keySize) GetTombstoneRecordSize(ref TKey key)
=> BlittableAllocatorImpl<TKey, TValue, TStoreFunctions>.GetTombstoneRecordSize(ref key);

/// <inheritdoc/>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly int GetRequiredRecordSize(long physicalAddress, int availableBytes) => GetAverageRecordSize();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,10 @@ void ReturnPage(int index)
internal static (int actualSize, int allocatedSize, int keySize) GetRMWCopyDestinationRecordSize<TInput, TVariableLengthInput>(ref TKey key, ref TInput input, ref TValue value, ref RecordInfo recordInfo, TVariableLengthInput varlenInput)
=> (RecordSize, RecordSize, KeySize);

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static (int actualSize, int allocatedSize, int keySize) GetTombstoneRecordSize(ref TKey key)
=> (RecordSize, RecordSize, KeySize);

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static (int actualSize, int allocatedSize, int keySize) GetRMWInitialRecordSize<TInput, TSessionFunctionsWrapper>(ref TKey key, ref TInput input, TSessionFunctionsWrapper sessionFunctions)
=> (RecordSize, RecordSize, KeySize);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ public readonly (int actualSize, int allocatedSize, int keySize) GetRMWCopyDesti
where TVariableLengthInput : IVariableLengthInput<TValue, TInput>
=> _this.GetRMWCopyDestinationRecordSize(ref key, ref input, ref value, ref recordInfo, varlenInput);

/// <inheritdoc />
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public (int actualSize, int allocatedSize, int keySize) GetTombstoneRecordSize(ref TKey key)
=> _this.GetTombstoneRecordSize(ref key);

/// <inheritdoc/>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly int GetRequiredRecordSize(long physicalAddress, int availableBytes) => GetAverageRecordSize();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,9 @@ internal ref TValue GetValue(long physicalAddress)
internal (int actualSize, int allocatedSize, int keySize) GetRMWCopyDestinationRecordSize<TInput, TVariableLengthInput>(ref TKey key, ref TInput input, ref TValue value, ref RecordInfo recordInfo, TVariableLengthInput varlenInput)
=> (RecordSize, RecordSize, KeySize);

internal (int actualSize, int allocatedSize, int keySize) GetTombstoneRecordSize(ref TKey key)
=> (RecordSize, RecordSize, KeySize);

internal int GetAverageRecordSize() => RecordSize;

internal int GetFixedRecordSize() => RecordSize;
Expand Down
3 changes: 3 additions & 0 deletions libs/storage/Tsavorite/cs/src/core/Allocator/IAllocator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ AllocatorBase<TKey, TValue, TStoreFunctions, TAllocator> GetBase<TAllocator>()
/// <summary>Get record size required for the given <paramref name="key"/> and <paramref name="value"/></summary>
(int actualSize, int allocatedSize, int keySize) GetRecordSize(ref TKey key, ref TValue value);

/// <summary>Get the record size for a tombstoned record</summary>
(int actualSize, int allocatedSize, int keySize) GetTombstoneRecordSize(ref TKey key);

/// <summary>Get the size of the given <paramref name="value"/></summary>
int GetValueLength(ref TValue value);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ public readonly (int actualSize, int allocatedSize, int keySize) GetRMWCopyDesti
where TVariableLengthInput : IVariableLengthInput<SpanByte, TInput>
=> _this.GetRMWCopyDestinationRecordSize(ref key, ref input, ref value, ref recordInfo, varlenInput);

/// <inheritdoc/>
public (int actualSize, int allocatedSize, int keySize) GetTombstoneRecordSize(ref SpanByte key) => _this.GetTombstoneRecordSize(ref key);

/// <inheritdoc/>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly int GetRequiredRecordSize(long physicalAddress, int availableBytes) => _this.GetRequiredRecordSize(physicalAddress, availableBytes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,16 @@ public ref SpanByte GetAndInitializeValue(long physicalAddress, long endAddress)
return (size, RoundUp(size, Constants.kRecordAlignment), keySize);
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal (int actualSize, int allocatedSize, int keySize) GetTombstoneRecordSize(ref SpanByte key)
{
int keySize = key.TotalSize;
// Only metadata space needed since this is going to be used for tombstoning anyway.
int minAllocationForTombstone = sizeof(int);
int size = RecordInfo.GetLength() + RoundUp(keySize, Constants.kRecordAlignment) + minAllocationForTombstone;
return (size, RoundUp(size, Constants.kRecordAlignment), keySize);
}

public int GetRequiredRecordSize(long physicalAddress, int availableBytes)
{
// We need at least [average record size]...
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ public bool PostCopyUpdater(ref TKey key, ref TInput input, ref TValue oldValue,
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public bool InPlaceUpdater(long physicalAddress, ref TKey key, ref TInput input, ref TValue value, ref TOutput output, ref RMWInfo rmwInfo, out OperationStatus status, ref RecordInfo recordInfo)
{
(rmwInfo.UsedValueLength, rmwInfo.FullValueLength, _) = _clientSession.store.GetRecordLengths(physicalAddress, ref value, ref recordInfo);
(rmwInfo.UsedValueLength, rmwInfo.FullValueLength, rmwInfo.FullRecordLength) = _clientSession.store.GetRecordLengths(physicalAddress, ref value, ref recordInfo);

if (_clientSession.functions.InPlaceUpdater(ref key, ref input, ref value, ref output, ref rmwInfo, ref recordInfo))
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,8 @@ public struct RMWInfo
/// </summary>
public int FullValueLength { get; internal set; }

public int FullRecordLength { get; internal set; }

/// <summary>
/// If set true by CopyUpdater, the source record for the RCU will not be elided from the tag chain even if this is otherwise possible.
/// </summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -253,5 +253,57 @@ private bool FindTagAndTryTransientSLock<TInput, TOutput, TContext, TSessionFunc
stackCtx.SetRecordSourceToHashEntry(hlogBase);
return true;
}


// Note: We do not currently consider this reuse for mid-chain records (records past the HashBucket), because TracebackForKeyMatch would need
// to return the next-higher record whose .PreviousAddress points to this one, *and* we'd need to make sure that record was not revivified out.
// Also, we do not consider this in-chain reuse for records with different keys, because we don't get here if the keys don't match.
private void HandleRecordElision<TInput, TOutput, TContext, TSessionFunctionsWrapper>(
TSessionFunctionsWrapper sessionFunctions, ref OperationStackContext<TKey, TValue, TStoreFunctions, TAllocator> stackCtx, ref RecordInfo srcRecordInfo, int usedValueLength, int fullValueLength, int fullRecordLength)
where TSessionFunctionsWrapper : ISessionFunctionsWrapper<TKey, TValue, TInput, TOutput, TContext, TStoreFunctions, TAllocator>
{
if (!RevivificationManager.IsEnabled)
{
// We are not doing revivification, so we just want to remove the record from the tag chain so we don't potentially do an IO later for key
// traceback. If we succeed, we need to SealAndInvalidate. It's fine if we don't succeed here; this is just tidying up the HashBucket.
if (stackCtx.hei.TryElide())
srcRecordInfo.SealAndInvalidate();
}
else if (RevivificationManager.UseFreeRecordPool)
{
// For non-FreeRecordPool revivification, we leave the record in as a normal tombstone so we can revivify it in the chain for the same key.
// For FreeRecord Pool we must first Seal here, even if we're using the LockTable, because the Sealed state must survive this Delete() call.
// We invalidate it also for checkpoint/recovery consistency (this removes Sealed bit so Scan would enumerate records that are not in any
// tag chain--they would be in the freelist if the freelist survived Recovery), but we restore the Valid bit if it is returned to the chain,
// which due to epoch protection is guaranteed to be done before the record can be written to disk and violate the "No Invalid records in
// tag chain" invariant.
srcRecordInfo.SealAndInvalidate();

bool isElided = false, isAdded = false;

Debug.Assert(stackCtx.recSrc.LogicalAddress < hlogBase.ReadOnlyAddress || srcRecordInfo.Tombstone, $"Unexpected loss of Tombstone; Record should have been XLocked or SealInvalidated. RecordInfo: {srcRecordInfo.ToString()}");

(isElided, isAdded) = TryElideAndTransferToFreeList<TInput, TOutput, TContext, TSessionFunctionsWrapper>(sessionFunctions, ref stackCtx, ref srcRecordInfo,
(usedValueLength, fullValueLength, fullRecordLength));

if (!isElided)
{
// Leave this in the chain as a normal Tombstone; we aren't going to add a new record so we can't leave this one sealed.
srcRecordInfo.UnsealAndValidate();
}
else if (!isAdded && RevivificationManager.restoreDeletedRecordsIfBinIsFull)
{
// The record was not added to the freelist, but was elided. See if we can put it back in as a normal Tombstone. Since we just
// elided it and the elision criteria is that it is the only above-BeginAddress record in the chain, and elision sets the
// HashBucketEntry.word to 0, it means we do not expect any records for this key's tag to exist after the elision. Therefore,
// we can re-insert the record iff the HashBucketEntry's address is <= kTempInvalidAddress.
stackCtx.hei = new(stackCtx.hei.hash);
FindOrCreateTag(ref stackCtx.hei, hlogBase.BeginAddress);

if (stackCtx.hei.entry.Address <= Constants.kTempInvalidAddress && stackCtx.hei.TryCAS(stackCtx.recSrc.LogicalAddress))
srcRecordInfo.UnsealAndValidate();
}
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -119,51 +119,10 @@ internal OperationStatus InternalDelete<TInput, TOutput, TContext, TSessionFunct

// Try to transfer the record from the tag chain to the free record pool iff previous address points to invalid address.
// Otherwise an earlier record for this key could be reachable again.
// Note: We do not currently consider this reuse for mid-chain records (records past the HashBucket), because TracebackForKeyMatch would need
// to return the next-higher record whose .PreviousAddress points to this one, *and* we'd need to make sure that record was not revivified out.
// Also, we do not consider this in-chain reuse for records with different keys, because we don't get here if the keys don't match.
if (CanElide<TInput, TOutput, TContext, TSessionFunctionsWrapper>(sessionFunctions, ref stackCtx, ref srcRecordInfo))
{
if (!RevivificationManager.IsEnabled)
{
// We are not doing revivification, so we just want to remove the record from the tag chain so we don't potentially do an IO later for key
// traceback. If we succeed, we need to SealAndInvalidate. It's fine if we don't succeed here; this is just tidying up the HashBucket.
if (stackCtx.hei.TryElide())
srcRecordInfo.SealAndInvalidate();
}
else if (RevivificationManager.UseFreeRecordPool)
{
// For non-FreeRecordPool revivification, we leave the record in as a normal tombstone so we can revivify it in the chain for the same key.
// For FreeRecord Pool we must first Seal here, even if we're using the LockTable, because the Sealed state must survive this Delete() call.
// We invalidate it also for checkpoint/recovery consistency (this removes Sealed bit so Scan would enumerate records that are not in any
// tag chain--they would be in the freelist if the freelist survived Recovery), but we restore the Valid bit if it is returned to the chain,
// which due to epoch protection is guaranteed to be done before the record can be written to disk and violate the "No Invalid records in
// tag chain" invariant.
srcRecordInfo.SealAndInvalidate();

bool isElided = false, isAdded = false;
Debug.Assert(srcRecordInfo.Tombstone, $"Unexpected loss of Tombstone; Record should have been XLocked or SealInvalidated. RecordInfo: {srcRecordInfo.ToString()}");
(isElided, isAdded) = TryElideAndTransferToFreeList<TInput, TOutput, TContext, TSessionFunctionsWrapper>(sessionFunctions, ref stackCtx, ref srcRecordInfo,
(deleteInfo.UsedValueLength, deleteInfo.FullValueLength, fullRecordLength));

if (!isElided)
{
// Leave this in the chain as a normal Tombstone; we aren't going to add a new record so we can't leave this one sealed.
srcRecordInfo.UnsealAndValidate();
}
else if (!isAdded && RevivificationManager.restoreDeletedRecordsIfBinIsFull)
{
// The record was not added to the freelist, but was elided. See if we can put it back in as a normal Tombstone. Since we just
// elided it and the elision criteria is that it is the only above-BeginAddress record in the chain, and elision sets the
// HashBucketEntry.word to 0, it means we do not expect any records for this key's tag to exist after the elision. Therefore,
// we can re-insert the record iff the HashBucketEntry's address is <= kTempInvalidAddress.
stackCtx.hei = new(stackCtx.hei.hash);
FindOrCreateTag(ref stackCtx.hei, hlogBase.BeginAddress);

if (stackCtx.hei.entry.Address <= Constants.kTempInvalidAddress && stackCtx.hei.TryCAS(stackCtx.recSrc.LogicalAddress))
srcRecordInfo.UnsealAndValidate();
}
}
HandleRecordElision<TInput, TOutput, TContext, TSessionFunctionsWrapper>(
sessionFunctions, ref stackCtx, ref srcRecordInfo, deleteInfo.UsedValueLength, deleteInfo.FullValueLength, fullRecordLength);
}

status = OperationStatusUtils.AdvancedOpCode(OperationStatus.SUCCESS, StatusCode.InPlaceUpdatedRecord);
Expand Down
Loading