From 03cfb70631bcf4082f033a6ce79ad07cf38f37b7 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 24 Oct 2025 12:04:03 -0700 Subject: [PATCH 1/7] Add TTL values for client caches of key locations So the client may automatically remove stale entries, e.g., storage server has since changed IP addresses. --- fdbclient/ClientKnobs.cpp | 4 + fdbclient/DatabaseContext.actor.cpp | 1580 +++++++++++++++++ fdbclient/NativeAPI.actor.cpp | 14 +- fdbclient/include/fdbclient/ClientKnobs.h | 6 + fdbclient/include/fdbclient/DatabaseContext.h | 13 +- 5 files changed, 1612 insertions(+), 5 deletions(-) create mode 100644 fdbclient/DatabaseContext.actor.cpp diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index b2b43012ce9..4d2e2a966cd 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -97,6 +97,10 @@ void ClientKnobs::initialize(Randomize randomize) { init( LOCATION_CACHE_EVICTION_SIZE_SIM, 10 ); if( randomize && BUGGIFY ) LOCATION_CACHE_EVICTION_SIZE_SIM = 3; init( LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD, 60 ); init( LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL, 60 ); + // TTL disabled by default to preserve existing behavior; set > 0 to enable + init( LOCATION_CACHE_ENTRY_TTL, 0.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_TTL = deterministicRandom()->randomInt(10, 60); + // When cache entry is used, extend its expiration by this amount (sliding window) + init( LOCATION_CACHE_ENTRY_REFRESH_TIME, 300.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_REFRESH_TIME = deterministicRandom()->randomInt(10, 60); init( GET_RANGE_SHARD_LIMIT, 2 ); init( WARM_RANGE_SHARD_LIMIT, 100 ); diff --git a/fdbclient/DatabaseContext.actor.cpp b/fdbclient/DatabaseContext.actor.cpp new file mode 100644 index 00000000000..e68f2bcc9c7 --- /dev/null +++ b/fdbclient/DatabaseContext.actor.cpp @@ -0,0 +1,1580 @@ +/* + * DatabaseContext.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2024 Apple 
Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// TODO: prune down the list of includes. This was copied from NativeAPI.actor.cpp. +#include "fdbclient/NativeAPI.actor.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "boost/algorithm/string.hpp" + +#include "fdbclient/Knobs.h" +#include "flow/CodeProbe.h" +#include "fmt/format.h" + +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/FDBTypes.h" +#include "fdbrpc/FailureMonitor.h" +#include "fdbrpc/MultiInterface.h" +#include "fdbrpc/TenantInfo.h" + +#include "fdbclient/ActorLineageProfiler.h" +#include "fdbclient/AnnotateActor.h" +#include "fdbclient/Atomic.h" +#include "fdbclient/ClusterInterface.h" +#include "fdbclient/ClusterConnectionFile.h" +#include "fdbclient/ClusterConnectionMemoryRecord.h" +#include "fdbclient/CoordinationInterface.h" +#include "fdbclient/CommitTransaction.h" +#include "fdbclient/DatabaseContext.h" +#include "fdbclient/GlobalConfig.actor.h" +#include "fdbclient/IKnobCollection.h" +#include "fdbclient/JsonBuilder.h" +#include "fdbclient/KeyBackedTypes.actor.h" +#include "fdbclient/KeyRangeMap.h" +#include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/NameLineage.h" +#include "fdbclient/CommitProxyInterface.h" +#include "fdbclient/MonitorLeader.h" +#include "fdbclient/MutationList.h" +#include "fdbclient/ParallelStream.actor.h" +#include 
"fdbclient/ReadYourWrites.h" +#include "fdbclient/SpecialKeySpace.actor.h" +#include "fdbclient/StorageServerInterface.h" +#include "fdbclient/SystemData.h" +#include "fdbclient/Tenant.h" +#include "fdbclient/TenantSpecialKeys.actor.h" +#include "fdbclient/TransactionLineage.h" +#include "fdbclient/versions.h" +#include "fdbrpc/WellKnownEndpoints.h" +#include "fdbrpc/LoadBalance.h" +#include "fdbrpc/Net2FileSystem.h" +#include "fdbrpc/simulator.h" +#include "fdbrpc/sim_validation.h" +#include "flow/Arena.h" +#include "flow/ActorCollection.h" +#include "flow/DeterministicRandom.h" +#include "flow/Error.h" +#include "flow/FastRef.h" +#include "flow/GetSourceVersion.h" +#include "flow/IRandom.h" +#include "flow/Trace.h" +#include "flow/ProtocolVersion.h" +#include "flow/flow.h" +#include "flow/genericactors.actor.h" +#include "flow/Knobs.h" +#include "flow/Platform.h" +#include "flow/SystemMonitor.h" +#include "flow/TLSConfig.actor.h" +#include "fdbclient/Tracing.h" +#include "flow/UnitTest.h" +#include "flow/network.h" +#include "flow/serialize.h" + +#ifdef ADDRESS_SANITIZER +#include +#endif + +#ifdef WIN32 +#define WIN32_LEAN_AND_MEAN +#include +#undef min +#undef max +#else +#include +#endif +#include "flow/actorcompiler.h" // This must be the last #include. + +Reference DatabaseContext::getWatchMetadata(int64_t tenantId, KeyRef key) const { + const auto it = watchMap.find(std::make_pair(tenantId, key)); + if (it == watchMap.end()) + return Reference(); + return it->second; +} + +void DatabaseContext::setWatchMetadata(Reference metadata) { + const WatchMapKey key(metadata->parameters->tenant.tenantId, metadata->parameters->key); + watchMap[key] = metadata; + // NOTE Here we do *NOT* update/reset the reference count for the key, see the source code in getWatchFuture. 
+ // Basically the reference count could be increased, or the same watch is refreshed, or the watch might be cancelled +} + +int32_t DatabaseContext::increaseWatchRefCount(const int64_t tenantID, KeyRef key, const Version& version) { + const WatchMapKey mapKey(tenantID, key); + watchCounterMap[mapKey].insert(version); + return watchCounterMap[mapKey].size(); +} + +int32_t DatabaseContext::decreaseWatchRefCount(const int64_t tenantID, KeyRef key, const Version& version) { + const WatchMapKey mapKey(tenantID, key); + auto mapKeyIter = watchCounterMap.find(mapKey); + if (mapKeyIter == std::end(watchCounterMap)) { + // Key does not exist. The metadata might be removed by deleteWatchMetadata already. + return 0; + } + + auto& versionSet = mapKeyIter->second; + auto versionIter = versionSet.find(version); + + if (versionIter == std::end(versionSet)) { + // Version not found, the watch might be cleared before. + return versionSet.size(); + } + versionSet.erase(versionIter); + + const auto count = versionSet.size(); + // The metadata might be deleted somewhere else, before calling this decreaseWatchRefCount + if (auto metadata = getWatchMetadata(tenantID, key); metadata.isValid() && versionSet.size() == 0) { + // It is a *must* to cancel the watchFutureSS manually. watchFutureSS waits for watchStorageServerResp, which + // holds a reference to the metadata. If the ACTOR is not cancelled, it indirectly holds a Future waiting for + // itself. 
+ metadata->watchFutureSS.cancel(); + deleteWatchMetadata(tenantID, key); + } + + return count; +} + +void DatabaseContext::deleteWatchMetadata(int64_t tenantId, KeyRef key, bool removeReferenceCount) { + const WatchMapKey mapKey(tenantId, key); + watchMap.erase(mapKey); + if (removeReferenceCount) { + watchCounterMap.erase(mapKey); + } +} + +void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi) { + auto result = tssMapping.find(ssi.id()); + // Update tss endpoint mapping if ss isn't in mapping, or the interface it mapped to changed + if (result == tssMapping.end() || + result->second.getValue.getEndpoint().token.first() != tssi.getValue.getEndpoint().token.first()) { + Reference metrics; + if (result == tssMapping.end()) { + // new TSS pairing + metrics = makeReference(); + tssMetrics[tssi.id()] = metrics; + tssMapping[ssi.id()] = tssi; + } else { + ASSERT(result->second.id() == tssi.id()); + metrics = tssMetrics[tssi.id()]; + result->second = tssi; + } + + // data requests duplicated for load and data comparison + queueModel.updateTssEndpoint(ssi.getValue.getEndpoint().token.first(), + TSSEndpointData(tssi.id(), tssi.getValue.getEndpoint(), metrics)); + queueModel.updateTssEndpoint(ssi.getKey.getEndpoint().token.first(), + TSSEndpointData(tssi.id(), tssi.getKey.getEndpoint(), metrics)); + queueModel.updateTssEndpoint(ssi.getKeyValues.getEndpoint().token.first(), + TSSEndpointData(tssi.id(), tssi.getKeyValues.getEndpoint(), metrics)); + queueModel.updateTssEndpoint(ssi.getMappedKeyValues.getEndpoint().token.first(), + TSSEndpointData(tssi.id(), tssi.getMappedKeyValues.getEndpoint(), metrics)); + queueModel.updateTssEndpoint(ssi.getKeyValuesStream.getEndpoint().token.first(), + TSSEndpointData(tssi.id(), tssi.getKeyValuesStream.getEndpoint(), metrics)); + + // non-data requests duplicated for load + queueModel.updateTssEndpoint(ssi.watchValue.getEndpoint().token.first(), + TSSEndpointData(tssi.id(), 
tssi.watchValue.getEndpoint(), metrics)); + queueModel.updateTssEndpoint(ssi.splitMetrics.getEndpoint().token.first(), + TSSEndpointData(tssi.id(), tssi.splitMetrics.getEndpoint(), metrics)); + queueModel.updateTssEndpoint(ssi.getReadHotRanges.getEndpoint().token.first(), + TSSEndpointData(tssi.id(), tssi.getReadHotRanges.getEndpoint(), metrics)); + queueModel.updateTssEndpoint(ssi.getRangeSplitPoints.getEndpoint().token.first(), + TSSEndpointData(tssi.id(), tssi.getRangeSplitPoints.getEndpoint(), metrics)); + } +} + +void DatabaseContext::removeTssMapping(StorageServerInterface const& ssi) { + auto result = tssMapping.find(ssi.id()); + if (result != tssMapping.end()) { + tssMetrics.erase(ssi.id()); + tssMapping.erase(result); + queueModel.removeTssEndpoint(ssi.getValue.getEndpoint().token.first()); + queueModel.removeTssEndpoint(ssi.getKey.getEndpoint().token.first()); + queueModel.removeTssEndpoint(ssi.getKeyValues.getEndpoint().token.first()); + queueModel.removeTssEndpoint(ssi.getMappedKeyValues.getEndpoint().token.first()); + queueModel.removeTssEndpoint(ssi.getKeyValuesStream.getEndpoint().token.first()); + + queueModel.removeTssEndpoint(ssi.watchValue.getEndpoint().token.first()); + queueModel.removeTssEndpoint(ssi.splitMetrics.getEndpoint().token.first()); + queueModel.removeTssEndpoint(ssi.getReadHotRanges.getEndpoint().token.first()); + queueModel.removeTssEndpoint(ssi.getRangeSplitPoints.getEndpoint().token.first()); + } +} + +void DatabaseContext::addSSIdTagMapping(const UID& uid, const Tag& tag) { + ssidTagMapping[uid] = tag; +} + +void DatabaseContext::getLatestCommitVersionForSSID(const UID& ssid, Tag& tag, Version& commitVersion) { + tag = invalidTag; + commitVersion = invalidVersion; + + auto iter = ssidTagMapping.find(ssid); + if (iter != ssidTagMapping.end()) { + tag = iter->second; + + if (ssVersionVectorCache.hasVersion(tag)) { + commitVersion = ssVersionVectorCache.getVersion(tag); + } + } +} + +void 
DatabaseContext::getLatestCommitVersion(const StorageServerInterface& ssi, + Version readVersion, + VersionVector& latestCommitVersion) { + latestCommitVersion.clear(); + + if (ssVersionVectorCache.getMaxVersion() == invalidVersion) { + return; + } + + // Error checking (based on the assumption that the read version was not obtained + // from the client's grv cache). + if (readVersion > ssVersionVectorCache.getMaxVersion()) { + TraceEvent(SevError, "ReadVersionExceedsVersionVectorMax") + .detail("ReadVersion", readVersion) + .detail("VersionVector", ssVersionVectorCache.toString()); + if (g_network->isSimulated()) { + ASSERT(false); + } else { + return; // Do not return a stale commit version in production. + } + } + + Tag tag = invalidTag; + Version commitVersion = invalidVersion; + getLatestCommitVersionForSSID(ssi.id(), tag, commitVersion); + + if (tag != invalidTag && commitVersion != invalidVersion && commitVersion < readVersion) { + latestCommitVersion.setVersion(tag, commitVersion); + } +} + +void DatabaseContext::getLatestCommitVersions(const Reference& locationInfo, + Reference info, + VersionVector& latestCommitVersions) { + latestCommitVersions.clear(); + + if (info->readOptions.present() && info->readOptions.get().debugID.present()) { + g_traceBatch.addEvent( + "TransactionDebug", info->readOptions.get().debugID.get().first(), "NativeAPI.getLatestCommitVersions"); + } + + if (!info->readVersionObtainedFromGrvProxy) { + return; + } + + if (ssVersionVectorCache.getMaxVersion() == invalidVersion) { + return; + } + + if (info->readVersion() > ssVersionVectorCache.getMaxVersion()) { + if (!CLIENT_KNOBS->FORCE_GRV_CACHE_OFF && !info->options.skipGrvCache && info->options.useGrvCache) { + return; + } else { + TraceEvent(SevError, "GetLatestCommitVersions") + .detail("ReadVersion", info->readVersion()) + .detail("VersionVector", ssVersionVectorCache.toString()); + ASSERT(false); + } + } + + std::map> versionMap; // order the versions to be returned + for (int i 
= 0; i < locationInfo->locations()->size(); i++) { + Tag tag = invalidTag; + Version commitVersion = invalidVersion; // latest commit version + getLatestCommitVersionForSSID(locationInfo->locations()->getId(i), tag, commitVersion); + + bool updatedVersionMap = false; + if (tag != invalidTag && commitVersion != invalidVersion && commitVersion < info->readVersion()) { + updatedVersionMap = true; + versionMap[commitVersion].insert(tag); + } + + // Do not log if commitVersion >= readVersion. + if (!updatedVersionMap && commitVersion == invalidVersion) { + TraceEvent(SevDebug, "CommitVersionNotFoundForSS") + .detail("InSSIDMap", tag != invalidTag ? 1 : 0) + .detail("Tag", tag) + .detail("CommitVersion", commitVersion) + .detail("ReadVersion", info->readVersion()) + .detail("VersionVector", ssVersionVectorCache.toString()) + .setMaxEventLength(11000) + .setMaxFieldLength(10000); + ++transactionCommitVersionNotFoundForSS; + } + } + + // insert the commit versions in the version vector. + for (auto& iter : versionMap) { + latestCommitVersions.setVersion(iter.second, iter.first); + } +} + +void updateCachedReadVersionShared(double t, Version v, DatabaseSharedState* p) { + MutexHolder mutex(p->mutexLock); + if (v >= p->grvCacheSpace.cachedReadVersion) { + //TraceEvent(SevDebug, "CacheReadVersionUpdate") + // .detail("Version", v) + // .detail("CurTime", t) + // .detail("LastVersion", p->grvCacheSpace.cachedReadVersion) + // .detail("LastTime", p->grvCacheSpace.lastGrvTime); + p->grvCacheSpace.cachedReadVersion = v; + if (t > p->grvCacheSpace.lastGrvTime) { + p->grvCacheSpace.lastGrvTime = t; + } + } +} + +void DatabaseContext::updateCachedReadVersion(double t, Version v) { + if (sharedStatePtr) { + return updateCachedReadVersionShared(t, v, sharedStatePtr); + } + if (v >= cachedReadVersion) { + //TraceEvent(SevDebug, "CachedReadVersionUpdate") + // .detail("Version", v) + // .detail("GrvStartTime", t) + // .detail("LastVersion", cachedReadVersion) + // .detail("LastTime", 
lastGrvTime); + cachedReadVersion = v; + // Since the time is based on the start of the request, it's possible that we + // get a newer version with an older time. + // (Request started earlier, but was latest to reach the proxy) + // Only update time when strictly increasing (?) + if (t > lastGrvTime) { + lastGrvTime = t; + } + } +} + +Version DatabaseContext::getCachedReadVersion() { + if (sharedStatePtr) { + MutexHolder mutex(sharedStatePtr->mutexLock); + return sharedStatePtr->grvCacheSpace.cachedReadVersion; + } + return cachedReadVersion; +} + +double DatabaseContext::getLastGrvTime() { + if (sharedStatePtr) { + MutexHolder mutex(sharedStatePtr->mutexLock); + return sharedStatePtr->grvCacheSpace.lastGrvTime; + } + return lastGrvTime; +} + +Reference StorageServerInfo::getInterface(DatabaseContext* cx, + StorageServerInterface const& ssi, + LocalityData const& locality) { + auto it = cx->server_interf.find(ssi.id()); + if (it != cx->server_interf.end()) { + if (it->second->interf.getValue.getEndpoint().token != ssi.getValue.getEndpoint().token) { + if (it->second->interf.locality == ssi.locality) { + // FIXME: load balance holds pointers to individual members of the interface, and this assignment will + // swap out the object they are + // pointing to. This is technically correct, but is very unnatural. We may want to refactor load + // balance to take an AsyncVar> so that it is notified when the interface + // changes. 
+ + it->second->interf = ssi; + } else { + it->second->notifyContextDestroyed(); + Reference loc(new StorageServerInfo(cx, ssi, locality)); + cx->server_interf[ssi.id()] = loc.getPtr(); + return loc; + } + } + + return Reference::addRef(it->second); + } + + Reference loc(new StorageServerInfo(cx, ssi, locality)); + cx->server_interf[ssi.id()] = loc.getPtr(); + return loc; +} + +void StorageServerInfo::notifyContextDestroyed() { + cx = nullptr; +} + +StorageServerInfo::~StorageServerInfo() { + if (cx) { + auto it = cx->server_interf.find(interf.id()); + if (it != cx->server_interf.end()) + cx->server_interf.erase(it); + cx = nullptr; + } +} + +void DatabaseContext::validateVersion(Version version) const { + // Version could be 0 if the INITIALIZE_NEW_DATABASE option is set. In that case, it is illegal to perform any + // reads. We throw client_invalid_operation because the caller didn't directly set the version, so the + // version_invalid error might be confusing. + if (version == 0) { + throw client_invalid_operation(); + } + if (switchable && version < minAcceptableReadVersion) { + CODE_PROBE(true, "Attempted to read a version lower than any this client has seen from the current cluster"); + throw transaction_too_old(); + } + + ASSERT(version > 0 || version == latestVersion); +} + +inline HealthMetrics populateHealthMetrics(const HealthMetrics& detailedMetrics, bool detailedOutput) { + if (detailedOutput) { + return detailedMetrics; + } else { + HealthMetrics result; + result.update(detailedMetrics, false, false); + return result; + } +} + +ACTOR static Future getHealthMetricsActor(DatabaseContext* cx, bool detailed, bool sendDetailedRequest) { + loop { + choose { + when(wait(cx->onProxiesChanged())) {} + when(GetHealthMetricsReply rep = wait(basicLoadBalance(cx->getGrvProxies(UseProvisionalProxies::False), + &GrvProxyInterface::getHealthMetrics, + GetHealthMetricsRequest(sendDetailedRequest)))) { + cx->healthMetrics.update(rep.healthMetrics, sendDetailedRequest, 
true); + cx->healthMetricsLastUpdated = now(); + if (sendDetailedRequest) { + cx->detailedHealthMetricsLastUpdated = now(); + } + return populateHealthMetrics(cx->healthMetrics, detailed); + } + } + } +} + +Future DatabaseContext::getHealthMetrics(bool detailed = false) { + if (now() - healthMetricsLastUpdated < CLIENT_KNOBS->AGGREGATE_HEALTH_METRICS_MAX_STALENESS) { + return populateHealthMetrics(healthMetrics, detailed); + } + bool sendDetailedRequest = + detailed && now() - detailedHealthMetricsLastUpdated > CLIENT_KNOBS->DETAILED_HEALTH_METRICS_MAX_STALENESS; + return getHealthMetricsActor(this, detailed, sendDetailedRequest); +} + +Future> DatabaseContext::getStorageStats(const UID& id, double maxStaleness) { + if (now() - detailedHealthMetricsLastUpdated < maxStaleness) { + auto it = healthMetrics.storageStats.find(id); + return it == healthMetrics.storageStats.end() ? Optional() : it->second; + } + + return map(getHealthMetricsActor(this, true, true), [&id](auto metrics) -> Optional { + auto it = metrics.storageStats.find(id); + return it == metrics.storageStats.end() ? 
Optional() : it->second; + }); +} + +// register a special key(s) implementation under the specified module +void DatabaseContext::registerSpecialKeysImpl(SpecialKeySpace::MODULE module, + SpecialKeySpace::IMPLTYPE type, + std::unique_ptr&& impl, + int deprecatedVersion) { + // if deprecated, add the implementation when the api version is less than the deprecated version + if (deprecatedVersion == -1 || apiVersion.version() < deprecatedVersion) { + specialKeySpace->registerKeyRange(module, type, impl->getKeyRange(), impl.get()); + specialKeySpaceModules.push_back(std::move(impl)); + } +} + +void traceTSSErrors(const char* name, UID tssId, const std::unordered_map& errorsByCode) { + TraceEvent ev(name, tssId); + for (auto& it : errorsByCode) { + ev.detail("E" + std::to_string(it.first), it.second); + } +} + +/* + For each request type, this will produce + Count + {SS,TSS}{Mean,P50,P90,P99} + Example: + GetValueLatencySSMean +*/ +void traceSSOrTSSPercentiles(TraceEvent& ev, const std::string name, DDSketch& sample) { + ev.detail(name + "Mean", sample.mean()); + // don't log the larger percentiles unless we actually have enough samples to log the accurate percentile instead of + // the largest sample in this window + if (sample.getPopulationSize() >= 3) { + ev.detail(name + "P50", sample.median()); + } + if (sample.getPopulationSize() >= 10) { + ev.detail(name + "P90", sample.percentile(0.90)); + } + if (sample.getPopulationSize() >= 100) { + ev.detail(name + "P99", sample.percentile(0.99)); + } +} + +void traceTSSPercentiles(TraceEvent& ev, + const std::string name, + DDSketch& ssSample, + DDSketch& tssSample) { + ASSERT(ssSample.getPopulationSize() == tssSample.getPopulationSize()); + ev.detail(name + "Count", ssSample.getPopulationSize()); + if (ssSample.getPopulationSize() > 0) { + traceSSOrTSSPercentiles(ev, name + "SS", ssSample); + traceSSOrTSSPercentiles(ev, name + "TSS", tssSample); + } +} + +ACTOR Future tssLogger(DatabaseContext* cx) { + state double 
lastLogged = 0; + loop { + wait(delay(CLIENT_KNOBS->TSS_METRICS_LOGGING_INTERVAL, TaskPriority::FlushTrace)); + + // Log each TSS pair separately + for (const auto& it : cx->tssMetrics) { + if (it.second->detailedMismatches.size()) { + cx->tssMismatchStream.send( + std::pair>(it.first, it.second->detailedMismatches)); + } + + // Do error histograms as separate event + if (it.second->ssErrorsByCode.size()) { + traceTSSErrors("TSS_SSErrors", it.first, it.second->ssErrorsByCode); + } + + if (it.second->tssErrorsByCode.size()) { + traceTSSErrors("TSS_TSSErrors", it.first, it.second->tssErrorsByCode); + } + + TraceEvent tssEv("TSSClientMetrics", cx->dbId); + tssEv.detail("TSSID", it.first) + .detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged) + .detail("Internal", cx->internal); + + it.second->cc.logToTraceEvent(tssEv); + + traceTSSPercentiles(tssEv, "GetValueLatency", it.second->SSgetValueLatency, it.second->TSSgetValueLatency); + traceTSSPercentiles( + tssEv, "GetKeyValuesLatency", it.second->SSgetKeyValuesLatency, it.second->TSSgetKeyValuesLatency); + traceTSSPercentiles(tssEv, "GetKeyLatency", it.second->SSgetKeyLatency, it.second->TSSgetKeyLatency); + traceTSSPercentiles(tssEv, + "GetMappedKeyValuesLatency", + it.second->SSgetMappedKeyValuesLatency, + it.second->TSSgetMappedKeyValuesLatency); + + it.second->clear(); + } + + lastLogged = now(); + } +} + +ACTOR Future databaseLogger(DatabaseContext* cx) { + state double lastLogged = 0; + loop { + wait(delay(CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskPriority::FlushTrace)); + + bool logMetrics = !g_network->isSimulated() || BUGGIFY_WITH_PROB(0.01); + if (logMetrics) { + TraceEvent ev("TransactionMetrics", cx->dbId); + + ev.detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged) + .detail("Cluster", + cx->getConnectionRecord() + ? 
cx->getConnectionRecord()->getConnectionString().clusterKeyName().toString() + : "") + .detail("Internal", cx->internal); + + cx->cc.logToTraceEvent(ev); + + ev.detail("LocationCacheEntryCount", cx->locationCache.size()); + ev.detail("MeanLatency", cx->latencies.mean()) + .detail("MedianLatency", cx->latencies.median()) + .detail("Latency90", cx->latencies.percentile(0.90)) + .detail("Latency98", cx->latencies.percentile(0.98)) + .detail("MaxLatency", cx->latencies.max()) + .detail("MeanRowReadLatency", cx->readLatencies.mean()) + .detail("MedianRowReadLatency", cx->readLatencies.median()) + .detail("MaxRowReadLatency", cx->readLatencies.max()) + .detail("MeanGRVLatency", cx->GRVLatencies.mean()) + .detail("MedianGRVLatency", cx->GRVLatencies.median()) + .detail("MaxGRVLatency", cx->GRVLatencies.max()) + .detail("MeanCommitLatency", cx->commitLatencies.mean()) + .detail("MedianCommitLatency", cx->commitLatencies.median()) + .detail("MaxCommitLatency", cx->commitLatencies.max()) + .detail("MeanMutationsPerCommit", cx->mutationsPerCommit.mean()) + .detail("MedianMutationsPerCommit", cx->mutationsPerCommit.median()) + .detail("MaxMutationsPerCommit", cx->mutationsPerCommit.max()) + .detail("MeanBytesPerCommit", cx->bytesPerCommit.mean()) + .detail("MedianBytesPerCommit", cx->bytesPerCommit.median()) + .detail("MaxBytesPerCommit", cx->bytesPerCommit.max()) + .detail("NumLocalityCacheEntries", cx->locationCache.size()); + } + + cx->latencies.clear(); + cx->readLatencies.clear(); + cx->GRVLatencies.clear(); + cx->commitLatencies.clear(); + cx->mutationsPerCommit.clear(); + cx->bytesPerCommit.clear(); + + lastLogged = now(); + } +} + +struct TrInfoChunk { + ValueRef value; + Key key; +}; + +static const Key CLIENT_LATENCY_INFO_PREFIX = "client_latency/"_sr; +static const Key CLIENT_LATENCY_INFO_CTR_PREFIX = "client_latency_counter/"_sr; + +ACTOR static Future transactionInfoCommitActor(Transaction* tr, std::vector* chunks) { + state const Key clientLatencyAtomicCtr = 
CLIENT_LATENCY_INFO_CTR_PREFIX.withPrefix(fdbClientInfoPrefixRange.begin); + state int retryCount = 0; + loop { + try { + tr->reset(); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + state Future> vstamp = tr->getVersionstamp(); + int64_t numCommitBytes = 0; + for (auto& chunk : *chunks) { + tr->atomicOp(chunk.key, chunk.value, MutationRef::SetVersionstampedKey); + numCommitBytes += chunk.key.size() + chunk.value.size() - + 4; // subtract number of bytes of key that denotes version stamp index + } + tr->atomicOp(clientLatencyAtomicCtr, StringRef((uint8_t*)&numCommitBytes, 8), MutationRef::AddValue); + wait(tr->commit()); + return Void(); + } catch (Error& e) { + retryCount++; + if (retryCount == 10) + throw; + wait(tr->onError(e)); + } + } +} + +ACTOR static Future delExcessClntTxnEntriesActor(Transaction* tr, int64_t clientTxInfoSizeLimit) { + state const Key clientLatencyName = CLIENT_LATENCY_INFO_PREFIX.withPrefix(fdbClientInfoPrefixRange.begin); + state const Key clientLatencyAtomicCtr = CLIENT_LATENCY_INFO_CTR_PREFIX.withPrefix(fdbClientInfoPrefixRange.begin); + TraceEvent(SevInfo, "DelExcessClntTxnEntriesCalled").log(); + + // If we don't limit it with retries, the DatabaseContext will never cleanup as Transaction + // object will be alive and hold reference to DatabaseContext. 
+ state int retries = 0; + loop { + try { + tr->reset(); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + Optional ctrValue = wait(tr->get(KeyRef(clientLatencyAtomicCtr), Snapshot::True)); + if (!ctrValue.present()) { + TraceEvent(SevInfo, "NumClntTxnEntriesNotFound").log(); + return Void(); + } + state int64_t txInfoSize = 0; + ASSERT(ctrValue.get().size() == sizeof(int64_t)); + memcpy(&txInfoSize, ctrValue.get().begin(), ctrValue.get().size()); + if (txInfoSize < clientTxInfoSizeLimit) + return Void(); + int getRangeByteLimit = (txInfoSize - clientTxInfoSizeLimit) < CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT + ? (txInfoSize - clientTxInfoSizeLimit) + : CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT; + GetRangeLimits limit(GetRangeLimits::ROW_LIMIT_UNLIMITED, getRangeByteLimit); + RangeResult txEntries = + wait(tr->getRange(KeyRangeRef(clientLatencyName, strinc(clientLatencyName)), limit)); + state int64_t numBytesToDel = 0; + KeyRef endKey; + for (auto& kv : txEntries) { + endKey = kv.key; + numBytesToDel += kv.key.size() + kv.value.size(); + if (txInfoSize - numBytesToDel <= clientTxInfoSizeLimit) + break; + } + if (numBytesToDel) { + tr->clear(KeyRangeRef(txEntries[0].key, strinc(endKey))); + TraceEvent(SevInfo, "DeletingExcessCntTxnEntries").detail("BytesToBeDeleted", numBytesToDel); + int64_t bytesDel = -numBytesToDel; + + tr->atomicOp(clientLatencyAtomicCtr, StringRef((uint8_t*)&bytesDel, 8), MutationRef::AddValue); + wait(tr->commit()); + } + if (txInfoSize - numBytesToDel <= clientTxInfoSizeLimit) + return Void(); + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled || retries++ >= 10) { + throw; + } + + wait(tr->onError(e)); + } + } +} + +// FIXME: explain what "client status" is +// The reason for getting a pointer to DatabaseContext instead of a reference counted object is because reference +// counting will increment reference count for DatabaseContext which holds the future of this actor. 
This creates a +// cyclic reference and hence this actor and Database object will not be destroyed at all. +ACTOR static Future clientStatusUpdateActor(DatabaseContext* cx) { + state const std::string clientLatencyName = + CLIENT_LATENCY_INFO_PREFIX.withPrefix(fdbClientInfoPrefixRange.begin).toString(); + state Transaction tr; + state std::vector commitQ; + state int txBytes = 0; + + loop { + // Make sure we are connected to the server. Otherwise we may just try to keep reconnecting + // with incompatible clusters. + wait(cx->onConnected()); + + // Need to make sure that we eventually destroy tr. We can't rely on getting cancelled to do + // this because of the cyclic reference to self. + wait(refreshTransaction(cx, &tr)); + try { + ASSERT(cx->clientStatusUpdater.outStatusQ.empty()); + cx->clientStatusUpdater.inStatusQ.swap(cx->clientStatusUpdater.outStatusQ); + // Split Transaction Info into chunks + state std::vector trChunksQ; + for (auto& entry : cx->clientStatusUpdater.outStatusQ) { + auto& bw = entry.second; + int64_t value_size_limit = BUGGIFY + ? deterministicRandom()->randomInt(1e3, CLIENT_KNOBS->VALUE_SIZE_LIMIT) + : CLIENT_KNOBS->VALUE_SIZE_LIMIT; + int num_chunks = (bw.getLength() + value_size_limit - 1) / value_size_limit; + std::string random_id = deterministicRandom()->randomAlphaNumeric(16); + std::string user_provided_id = entry.first.size() ? 
entry.first + "/" : ""; + for (int i = 0; i < num_chunks; i++) { + TrInfoChunk chunk; + BinaryWriter chunkBW(Unversioned()); + chunkBW << bigEndian32(i + 1) << bigEndian32(num_chunks); + chunk.key = KeyRef(clientLatencyName + std::string(10, '\x00') + "/" + random_id + "/" + + chunkBW.toValue().toString() + "/" + user_provided_id + std::string(4, '\x00')); + int32_t pos = littleEndian32(clientLatencyName.size()); + memcpy(mutateString(chunk.key) + chunk.key.size() - sizeof(int32_t), &pos, sizeof(int32_t)); + if (i == num_chunks - 1) { + chunk.value = ValueRef(static_cast(bw.getData()) + (i * value_size_limit), + bw.getLength() - (i * value_size_limit)); + } else { + chunk.value = + ValueRef(static_cast(bw.getData()) + (i * value_size_limit), value_size_limit); + } + trChunksQ.push_back(std::move(chunk)); + } + } + + // Commit the chunks splitting into different transactions if needed + state int64_t dataSizeLimit = + BUGGIFY ? deterministicRandom()->randomInt(200e3, 1.5 * CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT) + : 0.8 * CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT; + state std::vector::iterator tracking_iter = trChunksQ.begin(); + ASSERT(commitQ.empty() && (txBytes == 0)); + loop { + state std::vector::iterator iter = tracking_iter; + txBytes = 0; + commitQ.clear(); + try { + while (iter != trChunksQ.end()) { + if (iter->value.size() + iter->key.size() + txBytes > dataSizeLimit) { + wait(transactionInfoCommitActor(&tr, &commitQ)); + tracking_iter = iter; + commitQ.clear(); + txBytes = 0; + } + commitQ.push_back(*iter); + txBytes += iter->value.size() + iter->key.size(); + ++iter; + } + if (!commitQ.empty()) { + wait(transactionInfoCommitActor(&tr, &commitQ)); + commitQ.clear(); + txBytes = 0; + } + break; + } catch (Error& e) { + if (e.code() == error_code_transaction_too_large) { + dataSizeLimit /= 2; + ASSERT(dataSizeLimit >= CLIENT_KNOBS->VALUE_SIZE_LIMIT + CLIENT_KNOBS->KEY_SIZE_LIMIT); + } else { + TraceEvent(SevWarnAlways, 
"ClientTrInfoErrorCommit").error(e).detail("TxBytes", txBytes); + commitQ.clear(); + txBytes = 0; + throw; + } + } + } + cx->clientStatusUpdater.outStatusQ.clear(); + wait(cx->globalConfig->onInitialized()); + double sampleRate = + cx->globalConfig->get(fdbClientInfoTxnSampleRate, std::numeric_limits::infinity()); + double clientSamplingProbability = + std::isinf(sampleRate) ? CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY : sampleRate; + int64_t sizeLimit = cx->globalConfig->get(fdbClientInfoTxnSizeLimit, -1); + int64_t clientTxnInfoSizeLimit = sizeLimit == -1 ? CLIENT_KNOBS->CSI_SIZE_LIMIT : sizeLimit; + if (!trChunksQ.empty() && deterministicRandom()->random01() < clientSamplingProbability) + wait(delExcessClntTxnEntriesActor(&tr, clientTxnInfoSizeLimit)); + + // Cleanup Transaction sooner than later, so that we don't hold reference to context. + tr = Transaction(); + wait(delay(CLIENT_KNOBS->CSI_STATUS_DELAY)); + } catch (Error& e) { + TraceEvent(SevWarnAlways, "UnableToWriteClientStatus").error(e); + if (e.code() == error_code_actor_cancelled) { + throw; + } + cx->clientStatusUpdater.outStatusQ.clear(); + + // Cleanup Transaction sooner than later, so that we don't hold reference to context. + tr = Transaction(); + wait(delay(10.0)); + } + } +} + +ACTOR Future assertFailure(GrvProxyInterface remote, Future> reply) { + try { + ErrorOr res = wait(reply); + if (!res.isError()) { + TraceEvent(SevError, "GotStaleReadVersion") + .detail("Remote", remote.getConsistentReadVersion.getEndpoint().addresses.address.toString()) + .detail("Provisional", remote.provisional) + .detail("ReadVersion", res.get().version); + ASSERT_WE_THINK(false); + } + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) { + throw; + } + // we want this to fail -- so getting here is good, we'll just ignore the error. 
+ } + return Void(); +} + +Future attemptGRVFromOldProxies(std::vector oldProxies, + std::vector newProxies) { + auto debugID = nondeterministicRandom()->randomUniqueID(); + g_traceBatch.addEvent("AttemptGRVFromOldProxyDebug", debugID.first(), "NativeAPI.attemptGRVFromOldProxies.Start"); + Span span("NAPI:VerifyCausalReadRisky"_loc); + std::vector> replies; + replies.reserve(oldProxies.size()); + GetReadVersionRequest req( + span.context, 1, TransactionPriority::IMMEDIATE, GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY); + TraceEvent evt("AttemptGRVFromOldProxies"); + evt.detail("NumOldProxies", oldProxies.size()).detail("NumNewProxies", newProxies.size()); + auto traceProxies = [&](std::vector& proxies, std::string const& key) { + for (int i = 0; i < proxies.size(); ++i) { + auto k = key + std::to_string(i); + evt.detail(k.c_str(), proxies[i].id()); + } + }; + traceProxies(oldProxies, "OldProxy"s); + traceProxies(newProxies, "NewProxy"s); + evt.log(); + for (auto& i : oldProxies) { + req.reply = ReplyPromise(); + replies.push_back(assertFailure(i, i.getConsistentReadVersion.tryGetReply(req))); + } + return waitForAll(replies); +} + +ACTOR static Future monitorClientDBInfoChange(DatabaseContext* cx, + Reference const> clientDBInfo, + AsyncTrigger* proxiesChangeTrigger) { + state std::vector curCommitProxies; + state std::vector curGrvProxies; + state ActorCollection actors(false); + state Future clientDBInfoOnChange = clientDBInfo->onChange(); + curCommitProxies = clientDBInfo->get().commitProxies; + curGrvProxies = clientDBInfo->get().grvProxies; + + loop { + choose { + when(wait(clientDBInfoOnChange)) { + clientDBInfoOnChange = clientDBInfo->onChange(); + if (clientDBInfo->get().commitProxies != curCommitProxies || + clientDBInfo->get().grvProxies != curGrvProxies) { + // This condition is a bit complicated. Here we want to verify that we're unable to receive a read + // version from a proxy of an old generation after a successful recovery. 
The conditions are: + // 1. We only do this with a configured probability. + // 2. If the old set of Grv proxies is empty, there's nothing to do + // 3. If the new set of Grv proxies is empty, it means the recovery is not complete. So if an old + // Grv proxy still gives out read versions, this would be correct behavior. + // 4. If we see a provisional proxy, it means the recovery didn't complete yet, so the same as (3) + // applies. + if (deterministicRandom()->random01() < cx->verifyCausalReadsProp && !curGrvProxies.empty() && + !clientDBInfo->get().grvProxies.empty() && !clientDBInfo->get().grvProxies[0].provisional) { + actors.add(attemptGRVFromOldProxies(curGrvProxies, clientDBInfo->get().grvProxies)); + } + curCommitProxies = clientDBInfo->get().commitProxies; + curGrvProxies = clientDBInfo->get().grvProxies; + // Commits in the previous epoch may have been recovered but not included in the version vector. + // Clear the version vector to ensure the latest commit versions are received. 
+ cx->ssVersionVectorCache.clear(); + proxiesChangeTrigger->trigger(); + } + } + when(wait(actors.getResult())) { + UNSTOPPABLE_ASSERT(false); + } + } + } +} + +void updateLocationCacheWithCaches(DatabaseContext* self, + const std::map& removed, + const std::map& added) { + // TODO: this needs to be more clever in the future + auto ranges = self->locationCache.ranges(); + for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) { + if (iter->value() && iter->value()->hasCaches) { + auto& val = iter->value(); + std::vector>> interfaces; + interfaces.reserve(val->size() - removed.size() + added.size()); + for (int i = 0; i < val->size(); ++i) { + const auto& interf = (*val)[i]; + if (removed.count(interf->interf.id()) == 0) { + interfaces.emplace_back(interf); + } + } + for (const auto& p : added) { + interfaces.push_back(makeReference>(p.second)); + } + iter->value() = makeReference(interfaces, true); + } + } +} + +ACTOR static Future handleTssMismatches(DatabaseContext* cx) { + state Reference tr; + state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); + state KeyBackedMap tssMismatchDB = KeyBackedMap(tssMismatchKeys.begin); + loop { + // return to calling actor, cx might be destroyed already with the tr reset below. + // This gives ~DatabaseContext a chance to cancel this actor. + wait(delay(0)); + + // + state std::pair> data = waitNext(cx->tssMismatchStream.getFuture()); + // find ss pair id so we can remove it from the mapping + state UID tssPairID; + bool found = false; + for (const auto& it : cx->tssMapping) { + if (it.second.id() == data.first) { + tssPairID = it.first; + found = true; + break; + } + } + if (found) { + state bool quarantine = CLIENT_KNOBS->QUARANTINE_TSS_ON_MISMATCH; + TraceEvent(SevWarnAlways, quarantine ? 
"TSS_QuarantineMismatch" : "TSS_KillMismatch") + .detail("TSSID", data.first.toString()); + CODE_PROBE(quarantine, "Quarantining TSS because it got mismatch"); + CODE_PROBE(!quarantine, "Killing TSS because it got mismatch"); + + tr = makeReference(Database(Reference::addRef(cx))); + state int tries = 0; + loop { + try { + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + if (quarantine) { + tr->set(tssQuarantineKeyFor(data.first), ""_sr); + } else { + tr->clear(serverTagKeyFor(data.first)); + } + tssMapDB.erase(tr, tssPairID); + + for (const DetailedTSSMismatch& d : data.second) { + // -> mismatch data + tssMismatchDB.set(tr, + Tuple::makeTuple(data.first.toString(), d.timestamp, d.mismatchId.toString()), + d.traceString); + } + + wait(tr->commit()); + + break; + } catch (Error& e) { + wait(tr->onError(e)); + } + tries++; + if (tries > 10) { + // Give up, it'll get another mismatch or a human will investigate eventually + TraceEvent("TSS_MismatchGaveUp").detail("TSSID", data.first.toString()); + break; + } + } + // clear out txn so that the extra DatabaseContext ref gets decref'd and we can free cx + tr = makeReference(); + } else { + CODE_PROBE(true, "Not handling TSS with mismatch because it's already gone"); + } + } +} + +ACTOR Future> getJSON(Database db, std::string jsonField = ""); + +struct SingleSpecialKeyImpl : SpecialKeyRangeReadImpl { + Future getRange(ReadYourWritesTransaction* ryw, + KeyRangeRef kr, + GetRangeLimits limitsHint) const override { + ASSERT(kr.contains(k)); + return map(f(ryw), [k = k](Optional v) { + RangeResult result; + if (v.present()) { + result.push_back_deep(result.arena(), KeyValueRef(k, v.get())); + } + return result; + }); + } + + SingleSpecialKeyImpl(KeyRef k, + const std::function>(ReadYourWritesTransaction*)>& f, + bool supportsTenants = false) + : SpecialKeyRangeReadImpl(singleKeyRange(k)), k(k), f(f), tenantSupport(supportsTenants) {} + + bool 
supportsTenants() const override { + CODE_PROBE(tenantSupport, "Single special key in tenant"); + return tenantSupport; + }; + +private: + Key k; + std::function>(ReadYourWritesTransaction*)> f; + bool tenantSupport; +}; + +class HealthMetricsRangeImpl : public SpecialKeyRangeAsyncImpl { +public: + explicit HealthMetricsRangeImpl(KeyRangeRef kr); + Future getRange(ReadYourWritesTransaction* ryw, + KeyRangeRef kr, + GetRangeLimits limitsHint) const override; +}; + +static RangeResult healthMetricsToKVPairs(const HealthMetrics& metrics, KeyRangeRef kr) { + RangeResult result; + if (CLIENT_BUGGIFY) + return result; + if (kr.contains("\xff\xff/metrics/health/aggregate"_sr) && metrics.worstStorageDurabilityLag != 0) { + json_spirit::mObject statsObj; + statsObj["batch_limited"] = metrics.batchLimited; + statsObj["tps_limit"] = metrics.tpsLimit; + statsObj["worst_storage_durability_lag"] = metrics.worstStorageDurabilityLag; + statsObj["limiting_storage_durability_lag"] = metrics.limitingStorageDurabilityLag; + statsObj["worst_storage_queue"] = metrics.worstStorageQueue; + statsObj["limiting_storage_queue"] = metrics.limitingStorageQueue; + statsObj["worst_log_queue"] = metrics.worstTLogQueue; + std::string statsString = + json_spirit::write_string(json_spirit::mValue(statsObj), json_spirit::Output_options::raw_utf8); + ValueRef bytes(result.arena(), statsString); + result.push_back(result.arena(), KeyValueRef("\xff\xff/metrics/health/aggregate"_sr, bytes)); + } + // tlog stats + { + int phase = 0; // Avoid comparing twice per loop iteration + for (const auto& [uid, logStats] : metrics.tLogQueue) { + StringRef k{ StringRef(uid.toString()).withPrefix("\xff\xff/metrics/health/log/"_sr, result.arena()) }; + if (phase == 0 && k >= kr.begin) { + phase = 1; + } + if (phase == 1) { + if (k < kr.end) { + json_spirit::mObject statsObj; + statsObj["log_queue"] = logStats; + std::string statsString = + json_spirit::write_string(json_spirit::mValue(statsObj), 
json_spirit::Output_options::raw_utf8); + ValueRef bytes(result.arena(), statsString); + result.push_back(result.arena(), KeyValueRef(k, bytes)); + } else { + break; + } + } + } + } + // Storage stats + { + int phase = 0; // Avoid comparing twice per loop iteration + for (const auto& [uid, storageStats] : metrics.storageStats) { + StringRef k{ StringRef(uid.toString()).withPrefix("\xff\xff/metrics/health/storage/"_sr, result.arena()) }; + if (phase == 0 && k >= kr.begin) { + phase = 1; + } + if (phase == 1) { + if (k < kr.end) { + json_spirit::mObject statsObj; + statsObj["storage_durability_lag"] = storageStats.storageDurabilityLag; + statsObj["storage_queue"] = storageStats.storageQueue; + statsObj["cpu_usage"] = storageStats.cpuUsage; + statsObj["disk_usage"] = storageStats.diskUsage; + std::string statsString = + json_spirit::write_string(json_spirit::mValue(statsObj), json_spirit::Output_options::raw_utf8); + ValueRef bytes(result.arena(), statsString); + result.push_back(result.arena(), KeyValueRef(k, bytes)); + } else { + break; + } + } + } + } + return result; +} + +ACTOR static Future healthMetricsGetRangeActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) { + HealthMetrics metrics = wait(ryw->getDatabase()->getHealthMetrics( + /*detailed ("per process")*/ kr.intersects( + KeyRangeRef("\xff\xff/metrics/health/storage/"_sr, "\xff\xff/metrics/health/storage0"_sr)) || + kr.intersects(KeyRangeRef("\xff\xff/metrics/health/log/"_sr, "\xff\xff/metrics/health/log0"_sr)))); + return healthMetricsToKVPairs(metrics, kr); +} + +HealthMetricsRangeImpl::HealthMetricsRangeImpl(KeyRangeRef kr) : SpecialKeyRangeAsyncImpl(kr) {} + +Future HealthMetricsRangeImpl::getRange(ReadYourWritesTransaction* ryw, + KeyRangeRef kr, + GetRangeLimits limitsHint) const { + return healthMetricsGetRangeActor(ryw, kr); +} + +ACTOR Future getClusterId(Database db) { + while (!db->clientInfo->get().clusterId.isValid()) { + wait(db->clientInfo->onChange()); + } + return 
db->clientInfo->get().clusterId; +} + +void DatabaseContext::initializeSpecialCounters() { + specialCounter(cc, "OutstandingWatches", [this] { return outstandingWatches; }); + specialCounter(cc, "WatchMapSize", [this] { return watchMap.size(); }); +} + +DatabaseContext::DatabaseContext(Reference>> connectionRecord, + Reference> clientInfo, + Reference> const> coordinator, + Future clientInfoMonitor, + TaskPriority taskID, + LocalityData const& clientLocality, + EnableLocalityLoadBalance enableLocalityLoadBalance, + LockAware lockAware, + IsInternal internal, + int _apiVersion, + IsSwitchable switchable, + Optional defaultTenant) + : dbId(deterministicRandom()->randomUniqueID()), lockAware(lockAware), switchable(switchable), + connectionRecord(connectionRecord), proxyProvisional(false), clientLocality(clientLocality), + enableLocalityLoadBalance(enableLocalityLoadBalance), defaultTenant(defaultTenant), internal(internal), + cc("TransactionMetrics", dbId.toString()), transactionReadVersions("ReadVersions", cc), + transactionReadVersionsThrottled("ReadVersionsThrottled", cc), + transactionReadVersionsCompleted("ReadVersionsCompleted", cc), + transactionReadVersionBatches("ReadVersionBatches", cc), + transactionBatchReadVersions("BatchPriorityReadVersions", cc), + transactionDefaultReadVersions("DefaultPriorityReadVersions", cc), + transactionImmediateReadVersions("ImmediatePriorityReadVersions", cc), + transactionBatchReadVersionsCompleted("BatchPriorityReadVersionsCompleted", cc), + transactionDefaultReadVersionsCompleted("DefaultPriorityReadVersionsCompleted", cc), + transactionImmediateReadVersionsCompleted("ImmediatePriorityReadVersionsCompleted", cc), + transactionLogicalReads("LogicalUncachedReads", cc), transactionPhysicalReads("PhysicalReadRequests", cc), + transactionPhysicalReadsCompleted("PhysicalReadRequestsCompleted", cc), + transactionGetKeyRequests("GetKeyRequests", cc), transactionGetValueRequests("GetValueRequests", cc), + 
transactionGetRangeRequests("GetRangeRequests", cc), + transactionGetMappedRangeRequests("GetMappedRangeRequests", cc), + transactionGetRangeStreamRequests("GetRangeStreamRequests", cc), transactionWatchRequests("WatchRequests", cc), + transactionGetAddressesForKeyRequests("GetAddressesForKeyRequests", cc), transactionBytesRead("BytesRead", cc), + transactionKeysRead("KeysRead", cc), transactionMetadataVersionReads("MetadataVersionReads", cc), + transactionCommittedMutations("CommittedMutations", cc), + transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionSetMutations("SetMutations", cc), + transactionClearMutations("ClearMutations", cc), transactionAtomicMutations("AtomicMutations", cc), + transactionsCommitStarted("CommitStarted", cc), transactionsCommitCompleted("CommitCompleted", cc), + transactionKeyServerLocationRequests("KeyServerLocationRequests", cc), + transactionKeyServerLocationRequestsCompleted("KeyServerLocationRequestsCompleted", cc), + transactionStatusRequests("StatusRequests", cc), transactionTenantLookupRequests("TenantLookupRequests", cc), + transactionTenantLookupRequestsCompleted("TenantLookupRequestsCompleted", cc), transactionsTooOld("TooOld", cc), + transactionsFutureVersions("FutureVersions", cc), transactionsNotCommitted("NotCommitted", cc), + transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc), + transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc), + transactionsLockRejected("LockRejected", cc), + transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc), + transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc), + transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), + + latencies(), readLatencies(), commitLatencies(), GRVLatencies(), mutationsPerCommit(), bytesPerCommit(), + outstandingWatches(0), sharedStatePtr(nullptr), 
lastGrvTime(0.0), cachedReadVersion(0), + lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0), lastProxyRequestTime(0.0), + transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor), + coordinator(coordinator), apiVersion(_apiVersion), mvCacheInsertLocation(0), healthMetricsLastUpdated(0), + detailedHealthMetricsLastUpdated(0), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), + specialKeySpace(std::make_unique(specialKeys.begin, specialKeys.end, /* test */ false)), + connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) { + + DisabledTraceEvent("DatabaseContextCreated", dbId).backtrace(); + + connected = (clientInfo->get().commitProxies.size() && clientInfo->get().grvProxies.size()) + ? Void() + : clientInfo->onChange(); + + metadataVersionCache.resize(CLIENT_KNOBS->METADATA_VERSION_CACHE_SIZE); + maxOutstandingWatches = CLIENT_KNOBS->DEFAULT_MAX_OUTSTANDING_WATCHES; + + snapshotRywEnabled = apiVersion.hasSnapshotRYW() ? 1 : 0; + + logger = databaseLogger(this) && tssLogger(this); + locationCacheSize = g_network->isSimulated() ? 
CLIENT_KNOBS->LOCATION_CACHE_EVICTION_SIZE_SIM + : CLIENT_KNOBS->LOCATION_CACHE_EVICTION_SIZE; + + getValueSubmitted.init("NativeAPI.GetValueSubmitted"_sr); + getValueCompleted.init("NativeAPI.GetValueCompleted"_sr); + + clientDBInfoMonitor = monitorClientDBInfoChange(this, clientInfo, &proxiesChangeTrigger); + tssMismatchHandler = handleTssMismatches(this); + clientStatusUpdater.actor = clientStatusUpdateActor(this); + + smoothMidShardSize.reset(CLIENT_KNOBS->INIT_MID_SHARD_BYTES); + globalConfig = std::make_unique(this); + + if (apiVersion.version() >= 740) { + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::METRICS, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique( + singleKeyRange("fault_tolerance_metrics_json"_sr) + .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::METRICS).begin))); + } + + if (apiVersion.version() >= 700) { + registerSpecialKeysImpl(SpecialKeySpace::MODULE::ERRORMSG, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique( + SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ERRORMSG).begin, + [](ReadYourWritesTransaction* ryw) -> Future> { + if (ryw->getSpecialKeySpaceErrorMsg().present()) + return Optional(ryw->getSpecialKeySpaceErrorMsg().get()); + else + return Optional(); + }, + true)); + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique( + KeyRangeRef("options/"_sr, "options0"_sr) + .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique(SpecialKeySpace::getManagementApiCommandRange("exclude"))); + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique(SpecialKeySpace::getManagementApiCommandRange("failed"))); + registerSpecialKeysImpl(SpecialKeySpace::MODULE::MANAGEMENT, + 
SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique( + SpecialKeySpace::getManagementApiCommandRange("excludedlocality"))); + registerSpecialKeysImpl(SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique( + SpecialKeySpace::getManagementApiCommandRange("failedlocality"))); + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique( + KeyRangeRef("in_progress_exclusion/"_sr, "in_progress_exclusion0"_sr) + .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::CONFIGURATION, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique( + KeyRangeRef("process/class_type/"_sr, "process/class_type0"_sr) + .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin))); + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::CONFIGURATION, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique( + KeyRangeRef("process/class_source/"_sr, "process/class_source0"_sr) + .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin))); + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique( + singleKeyRange("db_locked"_sr) + .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique( + singleKeyRange("consistency_check_suspended"_sr) + .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::GLOBALCONFIG, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::GLOBALCONFIG))); + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::TRACING, + 
SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::TRACING))); + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::CONFIGURATION, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique( + KeyRangeRef("coordinators/"_sr, "coordinators0"_sr) + .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin))); + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique( + singleKeyRange("auto_coordinators"_sr) + .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique( + singleKeyRange("min_required_commit_version"_sr) + .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique( + singleKeyRange("version_epoch"_sr) + .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique( + KeyRangeRef("profiling/"_sr, "profiling0"_sr) + .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)), + /* deprecated */ ApiVersion::withClientProfilingDeprecated().version()); + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique( + KeyRangeRef("maintenance/"_sr, "maintenance0"_sr) + .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique( + KeyRangeRef("data_distribution/"_sr, "data_distribution0"_sr) + 
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::ACTORLINEAGE, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ACTORLINEAGE))); + registerSpecialKeysImpl(SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique( + SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF))); + } + if (apiVersion.version() >= 630) { + registerSpecialKeysImpl(SpecialKeySpace::MODULE::TRANSACTION, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique(conflictingKeysRange)); + registerSpecialKeysImpl(SpecialKeySpace::MODULE::TRANSACTION, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique(readConflictRangeKeysRange)); + registerSpecialKeysImpl(SpecialKeySpace::MODULE::TRANSACTION, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique(writeConflictRangeKeysRange)); + registerSpecialKeysImpl(SpecialKeySpace::MODULE::METRICS, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique(ddStatsRange)); + registerSpecialKeysImpl(SpecialKeySpace::MODULE::METRICS, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique( + KeyRangeRef("\xff\xff/metrics/health/"_sr, "\xff\xff/metrics/health0"_sr))); + registerSpecialKeysImpl(SpecialKeySpace::MODULE::WORKERINTERFACE, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique( + KeyRangeRef("\xff\xff/worker_interfaces/"_sr, "\xff\xff/worker_interfaces0"_sr))); + registerSpecialKeysImpl(SpecialKeySpace::MODULE::STATUSJSON, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique( + "\xff\xff/status/json"_sr, + [](ReadYourWritesTransaction* ryw) -> Future> { + if (ryw->getDatabase().getPtr() && ryw->getDatabase()->getConnectionRecord()) { + ++ryw->getDatabase()->transactionStatusRequests; + return getJSON(ryw->getDatabase()); + } else { + return Optional(); + } + }, + true)); + 
registerSpecialKeysImpl(SpecialKeySpace::MODULE::CLUSTERFILEPATH, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique( + "\xff\xff/cluster_file_path"_sr, + [](ReadYourWritesTransaction* ryw) -> Future> { + try { + if (ryw->getDatabase().getPtr() && + ryw->getDatabase()->getConnectionRecord()) { + Optional output = + StringRef(ryw->getDatabase()->getConnectionRecord()->getLocation()); + return output; + } + } catch (Error& e) { + return e; + } + return Optional(); + }, + true)); + + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::CONNECTIONSTRING, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique( + "\xff\xff/connection_string"_sr, + [](ReadYourWritesTransaction* ryw) -> Future> { + try { + if (ryw->getDatabase().getPtr() && ryw->getDatabase()->getConnectionRecord()) { + Reference f = ryw->getDatabase()->getConnectionRecord(); + Optional output = StringRef(f->getConnectionString().toString()); + return output; + } + } catch (Error& e) { + return e; + } + return Optional(); + }, + true)); + registerSpecialKeysImpl(SpecialKeySpace::MODULE::CLUSTERID, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique( + "\xff\xff/cluster_id"_sr, + [](ReadYourWritesTransaction* ryw) -> Future> { + try { + if (ryw->getDatabase().getPtr()) { + return map(getClusterId(ryw->getDatabase()), [](UID id) { + return Optional(StringRef(id.toString())); + }); + } + } catch (Error& e) { + return e; + } + return Optional(); + }, + true)); + + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique(SpecialKeySpace::getManagementApiCommandRange("tenant"))); + } + throttleExpirer = recurring([this]() { expireThrottles(); }, CLIENT_KNOBS->TAG_THROTTLE_EXPIRATION_INTERVAL); + + if (BUGGIFY) { + DatabaseContext::debugUseTags = true; + } + + initializeSpecialCounters(); +} + +DatabaseContext::DatabaseContext(const Error& err) + : deferredError(err), internal(IsInternal::False), cc("TransactionMetrics"), + 
transactionReadVersions("ReadVersions", cc), transactionReadVersionsThrottled("ReadVersionsThrottled", cc), + transactionReadVersionsCompleted("ReadVersionsCompleted", cc), + transactionReadVersionBatches("ReadVersionBatches", cc), + transactionBatchReadVersions("BatchPriorityReadVersions", cc), + transactionDefaultReadVersions("DefaultPriorityReadVersions", cc), + transactionImmediateReadVersions("ImmediatePriorityReadVersions", cc), + transactionBatchReadVersionsCompleted("BatchPriorityReadVersionsCompleted", cc), + transactionDefaultReadVersionsCompleted("DefaultPriorityReadVersionsCompleted", cc), + transactionImmediateReadVersionsCompleted("ImmediatePriorityReadVersionsCompleted", cc), + transactionLogicalReads("LogicalUncachedReads", cc), transactionPhysicalReads("PhysicalReadRequests", cc), + transactionPhysicalReadsCompleted("PhysicalReadRequestsCompleted", cc), + transactionGetKeyRequests("GetKeyRequests", cc), transactionGetValueRequests("GetValueRequests", cc), + transactionGetRangeRequests("GetRangeRequests", cc), + transactionGetMappedRangeRequests("GetMappedRangeRequests", cc), + transactionGetRangeStreamRequests("GetRangeStreamRequests", cc), transactionWatchRequests("WatchRequests", cc), + transactionGetAddressesForKeyRequests("GetAddressesForKeyRequests", cc), transactionBytesRead("BytesRead", cc), + transactionKeysRead("KeysRead", cc), transactionMetadataVersionReads("MetadataVersionReads", cc), + transactionCommittedMutations("CommittedMutations", cc), + transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionSetMutations("SetMutations", cc), + transactionClearMutations("ClearMutations", cc), transactionAtomicMutations("AtomicMutations", cc), + transactionsCommitStarted("CommitStarted", cc), transactionsCommitCompleted("CommitCompleted", cc), + transactionKeyServerLocationRequests("KeyServerLocationRequests", cc), + transactionKeyServerLocationRequestsCompleted("KeyServerLocationRequestsCompleted", cc), + 
transactionStatusRequests("StatusRequests", cc), transactionTenantLookupRequests("TenantLookupRequests", cc), + transactionTenantLookupRequestsCompleted("TenantLookupRequestsCompleted", cc), transactionsTooOld("TooOld", cc), + transactionsFutureVersions("FutureVersions", cc), transactionsNotCommitted("NotCommitted", cc), + transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc), + transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc), + transactionsLockRejected("LockRejected", cc), + transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc), + transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc), + transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), latencies(), readLatencies(), + commitLatencies(), GRVLatencies(), mutationsPerCommit(), bytesPerCommit(), sharedStatePtr(nullptr), + transactionTracingSample(false), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), + connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())), outstandingWatches(0) { + initializeSpecialCounters(); +} + +// Static constructor used by server processes to create a DatabaseContext +// For internal (fdbserver) use only +Database DatabaseContext::create(Reference> clientInfo, + Future clientInfoMonitor, + LocalityData clientLocality, + EnableLocalityLoadBalance enableLocalityLoadBalance, + TaskPriority taskID, + LockAware lockAware, + int apiVersion, + IsSwitchable switchable) { + return Database(new DatabaseContext(Reference>>(), + clientInfo, + makeReference>>(), + clientInfoMonitor, + taskID, + clientLocality, + enableLocalityLoadBalance, + lockAware, + IsInternal::True, + apiVersion, + switchable)); +} + +DatabaseContext::~DatabaseContext() { + clientDBInfoMonitor.cancel(); + monitorTssInfoChange.cancel(); + tssMismatchHandler.cancel(); + storage = nullptr; + + if 
(grvUpdateHandler.isValid()) { + grvUpdateHandler.cancel(); + } + if (sharedStatePtr) { + sharedStatePtr->delRef(sharedStatePtr); + } + for (auto it = server_interf.begin(); it != server_interf.end(); it = server_interf.erase(it)) + it->second->notifyContextDestroyed(); + ASSERT_ABORT(server_interf.empty()); + locationCache.insert(allKeys, Reference()); + + DisabledTraceEvent("DatabaseContextDestructed", dbId).backtrace(); +} diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 33bd1a4d59f..08c002df922 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1953,8 +1953,13 @@ Optional DatabaseContext::getCachedLocation(const TenantIn auto range = isBackward ? locationCache.rangeContainingKeyBefore(resolvedKey) : locationCache.rangeContaining(resolvedKey); - if (range->value()) { - return KeyRangeLocationInfo(toPrefixRelativeRange(range->range(), tenant.prefix), range->value()); + auto& loc = range->value(); + if (loc) { + // Cache hit: extend expiration time if refresh knob is set + if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME > 0.0 && loc->expireTime > 0.0) { + loc->expireTime = now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME; + } + return KeyRangeLocationInfo(toPrefixRelativeRange(range->range(), tenant.prefix), loc); } return Optional(); @@ -1984,6 +1989,10 @@ bool DatabaseContext::getCachedLocations(const TenantInfo& tenant, result.clear(); return false; } + // Cache hit: extend expiration time if refresh knob is set + if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME > 0.0 && r->value()->expireTime > 0.0) { + r->value()->expireTime = now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME; + } result.emplace_back(toPrefixRelativeRange(r->range() & resolvedRange, tenant.prefix), r->value()); if (result.size() == limit || begin == end) { break; @@ -2008,6 +2017,7 @@ Reference DatabaseContext::setCachedLocation(const KeyRangeRef& ab int maxEvictionAttempts = 100, attempts = 0; auto loc = 
makeReference(serverRefs); + // TODO: ideally remove based on TTL expiration times, instead of random while (locationCache.size() > locationCacheSize && attempts < maxEvictionAttempts) { CODE_PROBE(true, "NativeAPI storage server locationCache entry evicted"); attempts++; diff --git a/fdbclient/include/fdbclient/ClientKnobs.h b/fdbclient/include/fdbclient/ClientKnobs.h index 6847d5c5d55..ec735bf185d 100644 --- a/fdbclient/include/fdbclient/ClientKnobs.h +++ b/fdbclient/include/fdbclient/ClientKnobs.h @@ -94,6 +94,12 @@ class ClientKnobs : public KnobsImpl { int LOCATION_CACHE_EVICTION_SIZE_SIM; double LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD; double LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL; + // If > 0, each key-location cache entry expires this many seconds after insertion. + // Default 0 disables TTL expiration and keeps current behavior. + double LOCATION_CACHE_ENTRY_TTL; + // If > 0, extend the expireTime by this many seconds when a cached entry is used (cache hit). + // Only has effect when LOCATION_CACHE_ENTRY_TTL > 0. + double LOCATION_CACHE_ENTRY_REFRESH_TIME; int GET_RANGE_SHARD_LIMIT; int WARM_RANGE_SHARD_LIMIT; diff --git a/fdbclient/include/fdbclient/DatabaseContext.h b/fdbclient/include/fdbclient/DatabaseContext.h index c26e50968f5..bda6e9e1266 100644 --- a/fdbclient/include/fdbclient/DatabaseContext.h +++ b/fdbclient/include/fdbclient/DatabaseContext.h @@ -64,15 +64,22 @@ class StorageServerInfo : public ReferencedInterface { struct LocationInfo : MultiInterface>, FastAllocated { using Locations = MultiInterface>; explicit LocationInfo(const std::vector>>& v) - : Locations(v) {} + : Locations(v), + expireTime(CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL > 0.0 ? now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL + : 0.0) {} LocationInfo(const std::vector>>& v, bool hasCaches) - : Locations(v), hasCaches(hasCaches) {} + : Locations(v), hasCaches(hasCaches), + expireTime(CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL > 0.0 ? 
now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL + : 0.0) {} LocationInfo(const LocationInfo&) = delete; LocationInfo(LocationInfo&&) = delete; LocationInfo& operator=(const LocationInfo&) = delete; LocationInfo& operator=(LocationInfo&&) = delete; - bool hasCaches = false; Reference locations() { return Reference::addRef(this); } + + bool hasCaches = false; + // Absolute expiration time for this cache entry. 0 means no expiration (TTL disabled). + double expireTime = 0.0; }; using CommitProxyInfo = ModelInterface; From e1e66ace304aff6cb7367324f98493f9408a8314 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 24 Oct 2025 12:24:13 -0700 Subject: [PATCH 2/7] Add the actor to clean up expired location cache entries --- fdbclient/ClientKnobs.cpp | 2 + fdbclient/DatabaseContext.actor.cpp | 42 +++++++++++++++++++ fdbclient/include/fdbclient/ClientKnobs.h | 3 ++ fdbclient/include/fdbclient/DatabaseContext.h | 1 + 4 files changed, 48 insertions(+) diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index 4d2e2a966cd..c85d10a43ce 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -101,6 +101,8 @@ void ClientKnobs::initialize(Randomize randomize) { init( LOCATION_CACHE_ENTRY_TTL, 0.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_TTL = deterministicRandom()->randomInt(10, 60); // When cache entry is used, extend its expiration by this amount (sliding window) init( LOCATION_CACHE_ENTRY_REFRESH_TIME, 300.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_REFRESH_TIME = deterministicRandom()->randomInt(10, 60); + // Run location cache cleanup every 60 seconds when TTL is enabled + init( LOCATION_CACHE_EVICTION_INTERVAL, 60.0 ); init( GET_RANGE_SHARD_LIMIT, 2 ); init( WARM_RANGE_SHARD_LIMIT, 100 ); diff --git a/fdbclient/DatabaseContext.actor.cpp b/fdbclient/DatabaseContext.actor.cpp index e68f2bcc9c7..4f6ba47d9d0 100644 --- a/fdbclient/DatabaseContext.actor.cpp +++ b/fdbclient/DatabaseContext.actor.cpp @@ -972,6 +972,47 @@ void 
updateLocationCacheWithCaches(DatabaseContext* self, } } +ACTOR static Future cleanupLocationCache(DatabaseContext* cx) { + // Only run cleanup if TTL is enabled + if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL == 0.0) { + return Void(); + } + + loop { + wait(delay(CLIENT_KNOBS->LOCATION_CACHE_EVICTION_INTERVAL)); + + double currentTime = now(); + std::vector toRemove; + int totalCount = 0; + + // Scan locationCache for expired entries + auto iter = cx->locationCache.randomRange(); + for (; iter != cx->locationCache.end(); ++iter) { + if (iter->value() && iter->value()->hasCaches) { + // Check the expireTime of the first cache entry as a representative + // All entries in a range typically have similar expiration times + if (iter->value()->locations()->expireTime > 0.0 && + iter->value()->locations()->expireTime <= currentTime) { + toRemove.push_back(iter->range()); + } + } + totalCount++; + if (totalCount > 1000 || toRemove.size() > 100) { + break; // Avoid long blocking scans + } + } + + // Remove expired entries + for (const auto& range : toRemove) { + cx->locationCache.insert(range, Reference()); + } + + if (!toRemove.empty()) { + TraceEvent("LocationCacheCleanup").detail("RemovedRanges", toRemove.size()); + } + } +} + ACTOR static Future handleTssMismatches(DatabaseContext* cx) { state Reference tr; state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); @@ -1255,6 +1296,7 @@ DatabaseContext::DatabaseContext(ReferenceINIT_MID_SHARD_BYTES); diff --git a/fdbclient/include/fdbclient/ClientKnobs.h b/fdbclient/include/fdbclient/ClientKnobs.h index ec735bf185d..ba0d5440e03 100644 --- a/fdbclient/include/fdbclient/ClientKnobs.h +++ b/fdbclient/include/fdbclient/ClientKnobs.h @@ -100,6 +100,9 @@ class ClientKnobs : public KnobsImpl { // If > 0, extend the expireTime by this many seconds when a cached entry is used (cache hit). // Only has effect when LOCATION_CACHE_ENTRY_TTL > 0. 
double LOCATION_CACHE_ENTRY_REFRESH_TIME; + // How often to run the background actor that removes expired location cache entries. + // Only has effect when LOCATION_CACHE_ENTRY_TTL > 0. Default 60 seconds. + double LOCATION_CACHE_EVICTION_INTERVAL; int GET_RANGE_SHARD_LIMIT; int WARM_RANGE_SHARD_LIMIT; diff --git a/fdbclient/include/fdbclient/DatabaseContext.h b/fdbclient/include/fdbclient/DatabaseContext.h index bda6e9e1266..dd0ebfbca76 100644 --- a/fdbclient/include/fdbclient/DatabaseContext.h +++ b/fdbclient/include/fdbclient/DatabaseContext.h @@ -502,6 +502,7 @@ class DatabaseContext : public ReferenceCounted, public FastAll Future tssMismatchHandler; PromiseStream>> tssMismatchStream; Future grvUpdateHandler; + Future locationCacheCleanup; Reference commitProxies; Reference grvProxies; bool proxyProvisional; // Provisional commit proxy and grv proxy are used at the same time. From 5b823d35592811a6a54bf5c059ebfc8b09c60645 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 24 Oct 2025 14:06:34 -0700 Subject: [PATCH 3/7] Add code probe for location cache cleanups --- fdbclient/DatabaseContext.actor.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbclient/DatabaseContext.actor.cpp b/fdbclient/DatabaseContext.actor.cpp index 4f6ba47d9d0..72445bda29e 100644 --- a/fdbclient/DatabaseContext.actor.cpp +++ b/fdbclient/DatabaseContext.actor.cpp @@ -987,12 +987,11 @@ ACTOR static Future cleanupLocationCache(DatabaseContext* cx) { // Scan locationCache for expired entries auto iter = cx->locationCache.randomRange(); - for (; iter != cx->locationCache.end(); ++iter) { + for (; iter != cx->locationCache.lastItem(); ++iter) { if (iter->value() && iter->value()->hasCaches) { // Check the expireTime of the first cache entry as a representative // All entries in a range typically have similar expiration times - if (iter->value()->locations()->expireTime > 0.0 && - iter->value()->locations()->expireTime <= currentTime) { + if 
(iter->value()->expireTime > 0.0 && iter->value()->expireTime <= currentTime) { toRemove.push_back(iter->range()); } } @@ -1008,6 +1007,7 @@ ACTOR static Future cleanupLocationCache(DatabaseContext* cx) { } if (!toRemove.empty()) { + CODE_PROBE(true, "LocationCacheCleanup removed some entries"); TraceEvent("LocationCacheCleanup").detail("RemovedRanges", toRemove.size()); } } From 7c37d03261a5e8863a306a23b75270cb4246c00e Mon Sep 17 00:00:00 2001 From: "Johannes M. Scheuermann" Date: Wed, 19 Nov 2025 16:55:12 +0100 Subject: [PATCH 4/7] Remove storage cache related check and add client knob for cache checks --- fdbclient/ClientKnobs.cpp | 11 +- fdbclient/DatabaseContext.actor.cpp | 1622 --------------------- fdbclient/NativeAPI.actor.cpp | 112 +- fdbclient/include/fdbclient/ClientKnobs.h | 6 + 4 files changed, 123 insertions(+), 1628 deletions(-) delete mode 100644 fdbclient/DatabaseContext.actor.cpp diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index c85d10a43ce..e5845406719 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -98,11 +98,16 @@ void ClientKnobs::initialize(Randomize randomize) { init( LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD, 60 ); init( LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL, 60 ); // TTL disabled by default to preserve existing behavior; set > 0 to enable - init( LOCATION_CACHE_ENTRY_TTL, 0.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_TTL = deterministicRandom()->randomInt(10, 60); + init( LOCATION_CACHE_ENTRY_TTL, 0.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_TTL = deterministicRandom()->randomInt(10, 60); // When cache entry is used, extend its expiration by this amount (sliding window) - init( LOCATION_CACHE_ENTRY_REFRESH_TIME, 300.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_REFRESH_TIME = deterministicRandom()->randomInt(10, 60); + init( LOCATION_CACHE_ENTRY_REFRESH_TIME, 300.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_REFRESH_TIME = 
deterministicRandom()->randomInt(10, 60); // Run location cache cleanup every 60 seconds when TTL is enabled - init( LOCATION_CACHE_EVICTION_INTERVAL, 60.0 ); + init( LOCATION_CACHE_EVICTION_INTERVAL, 60.0 ); + // The maximum entries per cache eviction iteration to check if they are expired. If set to a negative number all entries will be validated. + init( LOCATION_CACHE_MAX_ENTRIES_PER_ITERATION, 1000.0 ); + // The maximum entries per cache eviction iteration to remove. If set to a negative number all expired cache entries will be removed. + init( LOCATION_CACHE_MAX_REMOVED_ENTRIES_PER_ITERATION, 60.0 ); + init( GET_RANGE_SHARD_LIMIT, 2 ); init( WARM_RANGE_SHARD_LIMIT, 100 ); diff --git a/fdbclient/DatabaseContext.actor.cpp b/fdbclient/DatabaseContext.actor.cpp deleted file mode 100644 index 72445bda29e..00000000000 --- a/fdbclient/DatabaseContext.actor.cpp +++ /dev/null @@ -1,1622 +0,0 @@ -/* - * DatabaseContext.cpp - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2024 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// TODO: prune down the list of includes. This was copied from NativeAPI.actor.cpp. 
-#include "fdbclient/NativeAPI.actor.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "boost/algorithm/string.hpp" - -#include "fdbclient/Knobs.h" -#include "flow/CodeProbe.h" -#include "fmt/format.h" - -#include "fdbclient/FDBOptions.g.h" -#include "fdbclient/FDBTypes.h" -#include "fdbrpc/FailureMonitor.h" -#include "fdbrpc/MultiInterface.h" -#include "fdbrpc/TenantInfo.h" - -#include "fdbclient/ActorLineageProfiler.h" -#include "fdbclient/AnnotateActor.h" -#include "fdbclient/Atomic.h" -#include "fdbclient/ClusterInterface.h" -#include "fdbclient/ClusterConnectionFile.h" -#include "fdbclient/ClusterConnectionMemoryRecord.h" -#include "fdbclient/CoordinationInterface.h" -#include "fdbclient/CommitTransaction.h" -#include "fdbclient/DatabaseContext.h" -#include "fdbclient/GlobalConfig.actor.h" -#include "fdbclient/IKnobCollection.h" -#include "fdbclient/JsonBuilder.h" -#include "fdbclient/KeyBackedTypes.actor.h" -#include "fdbclient/KeyRangeMap.h" -#include "fdbclient/ManagementAPI.actor.h" -#include "fdbclient/NameLineage.h" -#include "fdbclient/CommitProxyInterface.h" -#include "fdbclient/MonitorLeader.h" -#include "fdbclient/MutationList.h" -#include "fdbclient/ParallelStream.actor.h" -#include "fdbclient/ReadYourWrites.h" -#include "fdbclient/SpecialKeySpace.actor.h" -#include "fdbclient/StorageServerInterface.h" -#include "fdbclient/SystemData.h" -#include "fdbclient/Tenant.h" -#include "fdbclient/TenantSpecialKeys.actor.h" -#include "fdbclient/TransactionLineage.h" -#include "fdbclient/versions.h" -#include "fdbrpc/WellKnownEndpoints.h" -#include "fdbrpc/LoadBalance.h" -#include "fdbrpc/Net2FileSystem.h" -#include "fdbrpc/simulator.h" -#include "fdbrpc/sim_validation.h" -#include "flow/Arena.h" -#include "flow/ActorCollection.h" -#include "flow/DeterministicRandom.h" -#include "flow/Error.h" -#include "flow/FastRef.h" -#include "flow/GetSourceVersion.h" -#include "flow/IRandom.h" 
-#include "flow/Trace.h" -#include "flow/ProtocolVersion.h" -#include "flow/flow.h" -#include "flow/genericactors.actor.h" -#include "flow/Knobs.h" -#include "flow/Platform.h" -#include "flow/SystemMonitor.h" -#include "flow/TLSConfig.actor.h" -#include "fdbclient/Tracing.h" -#include "flow/UnitTest.h" -#include "flow/network.h" -#include "flow/serialize.h" - -#ifdef ADDRESS_SANITIZER -#include -#endif - -#ifdef WIN32 -#define WIN32_LEAN_AND_MEAN -#include -#undef min -#undef max -#else -#include -#endif -#include "flow/actorcompiler.h" // This must be the last #include. - -Reference DatabaseContext::getWatchMetadata(int64_t tenantId, KeyRef key) const { - const auto it = watchMap.find(std::make_pair(tenantId, key)); - if (it == watchMap.end()) - return Reference(); - return it->second; -} - -void DatabaseContext::setWatchMetadata(Reference metadata) { - const WatchMapKey key(metadata->parameters->tenant.tenantId, metadata->parameters->key); - watchMap[key] = metadata; - // NOTE Here we do *NOT* update/reset the reference count for the key, see the source code in getWatchFuture. - // Basically the reference count could be increased, or the same watch is refreshed, or the watch might be cancelled -} - -int32_t DatabaseContext::increaseWatchRefCount(const int64_t tenantID, KeyRef key, const Version& version) { - const WatchMapKey mapKey(tenantID, key); - watchCounterMap[mapKey].insert(version); - return watchCounterMap[mapKey].size(); -} - -int32_t DatabaseContext::decreaseWatchRefCount(const int64_t tenantID, KeyRef key, const Version& version) { - const WatchMapKey mapKey(tenantID, key); - auto mapKeyIter = watchCounterMap.find(mapKey); - if (mapKeyIter == std::end(watchCounterMap)) { - // Key does not exist. The metadata might be removed by deleteWatchMetadata already. 
- return 0; - } - - auto& versionSet = mapKeyIter->second; - auto versionIter = versionSet.find(version); - - if (versionIter == std::end(versionSet)) { - // Version not found, the watch might be cleared before. - return versionSet.size(); - } - versionSet.erase(versionIter); - - const auto count = versionSet.size(); - // The metadata might be deleted somewhere else, before calling this decreaseWatchRefCount - if (auto metadata = getWatchMetadata(tenantID, key); metadata.isValid() && versionSet.size() == 0) { - // It is a *must* to cancel the watchFutureSS manually. watchFutureSS waits for watchStorageServerResp, which - // holds a reference to the metadata. If the ACTOR is not cancelled, it indirectly holds a Future waiting for - // itself. - metadata->watchFutureSS.cancel(); - deleteWatchMetadata(tenantID, key); - } - - return count; -} - -void DatabaseContext::deleteWatchMetadata(int64_t tenantId, KeyRef key, bool removeReferenceCount) { - const WatchMapKey mapKey(tenantId, key); - watchMap.erase(mapKey); - if (removeReferenceCount) { - watchCounterMap.erase(mapKey); - } -} - -void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi) { - auto result = tssMapping.find(ssi.id()); - // Update tss endpoint mapping if ss isn't in mapping, or the interface it mapped to changed - if (result == tssMapping.end() || - result->second.getValue.getEndpoint().token.first() != tssi.getValue.getEndpoint().token.first()) { - Reference metrics; - if (result == tssMapping.end()) { - // new TSS pairing - metrics = makeReference(); - tssMetrics[tssi.id()] = metrics; - tssMapping[ssi.id()] = tssi; - } else { - ASSERT(result->second.id() == tssi.id()); - metrics = tssMetrics[tssi.id()]; - result->second = tssi; - } - - // data requests duplicated for load and data comparison - queueModel.updateTssEndpoint(ssi.getValue.getEndpoint().token.first(), - TSSEndpointData(tssi.id(), tssi.getValue.getEndpoint(), metrics)); - 
queueModel.updateTssEndpoint(ssi.getKey.getEndpoint().token.first(), - TSSEndpointData(tssi.id(), tssi.getKey.getEndpoint(), metrics)); - queueModel.updateTssEndpoint(ssi.getKeyValues.getEndpoint().token.first(), - TSSEndpointData(tssi.id(), tssi.getKeyValues.getEndpoint(), metrics)); - queueModel.updateTssEndpoint(ssi.getMappedKeyValues.getEndpoint().token.first(), - TSSEndpointData(tssi.id(), tssi.getMappedKeyValues.getEndpoint(), metrics)); - queueModel.updateTssEndpoint(ssi.getKeyValuesStream.getEndpoint().token.first(), - TSSEndpointData(tssi.id(), tssi.getKeyValuesStream.getEndpoint(), metrics)); - - // non-data requests duplicated for load - queueModel.updateTssEndpoint(ssi.watchValue.getEndpoint().token.first(), - TSSEndpointData(tssi.id(), tssi.watchValue.getEndpoint(), metrics)); - queueModel.updateTssEndpoint(ssi.splitMetrics.getEndpoint().token.first(), - TSSEndpointData(tssi.id(), tssi.splitMetrics.getEndpoint(), metrics)); - queueModel.updateTssEndpoint(ssi.getReadHotRanges.getEndpoint().token.first(), - TSSEndpointData(tssi.id(), tssi.getReadHotRanges.getEndpoint(), metrics)); - queueModel.updateTssEndpoint(ssi.getRangeSplitPoints.getEndpoint().token.first(), - TSSEndpointData(tssi.id(), tssi.getRangeSplitPoints.getEndpoint(), metrics)); - } -} - -void DatabaseContext::removeTssMapping(StorageServerInterface const& ssi) { - auto result = tssMapping.find(ssi.id()); - if (result != tssMapping.end()) { - tssMetrics.erase(ssi.id()); - tssMapping.erase(result); - queueModel.removeTssEndpoint(ssi.getValue.getEndpoint().token.first()); - queueModel.removeTssEndpoint(ssi.getKey.getEndpoint().token.first()); - queueModel.removeTssEndpoint(ssi.getKeyValues.getEndpoint().token.first()); - queueModel.removeTssEndpoint(ssi.getMappedKeyValues.getEndpoint().token.first()); - queueModel.removeTssEndpoint(ssi.getKeyValuesStream.getEndpoint().token.first()); - - queueModel.removeTssEndpoint(ssi.watchValue.getEndpoint().token.first()); - 
queueModel.removeTssEndpoint(ssi.splitMetrics.getEndpoint().token.first()); - queueModel.removeTssEndpoint(ssi.getReadHotRanges.getEndpoint().token.first()); - queueModel.removeTssEndpoint(ssi.getRangeSplitPoints.getEndpoint().token.first()); - } -} - -void DatabaseContext::addSSIdTagMapping(const UID& uid, const Tag& tag) { - ssidTagMapping[uid] = tag; -} - -void DatabaseContext::getLatestCommitVersionForSSID(const UID& ssid, Tag& tag, Version& commitVersion) { - tag = invalidTag; - commitVersion = invalidVersion; - - auto iter = ssidTagMapping.find(ssid); - if (iter != ssidTagMapping.end()) { - tag = iter->second; - - if (ssVersionVectorCache.hasVersion(tag)) { - commitVersion = ssVersionVectorCache.getVersion(tag); - } - } -} - -void DatabaseContext::getLatestCommitVersion(const StorageServerInterface& ssi, - Version readVersion, - VersionVector& latestCommitVersion) { - latestCommitVersion.clear(); - - if (ssVersionVectorCache.getMaxVersion() == invalidVersion) { - return; - } - - // Error checking (based on the assumption that the read version was not obtained - // from the client's grv cache). - if (readVersion > ssVersionVectorCache.getMaxVersion()) { - TraceEvent(SevError, "ReadVersionExceedsVersionVectorMax") - .detail("ReadVersion", readVersion) - .detail("VersionVector", ssVersionVectorCache.toString()); - if (g_network->isSimulated()) { - ASSERT(false); - } else { - return; // Do not return a stale commit version in production. 
- } - } - - Tag tag = invalidTag; - Version commitVersion = invalidVersion; - getLatestCommitVersionForSSID(ssi.id(), tag, commitVersion); - - if (tag != invalidTag && commitVersion != invalidVersion && commitVersion < readVersion) { - latestCommitVersion.setVersion(tag, commitVersion); - } -} - -void DatabaseContext::getLatestCommitVersions(const Reference& locationInfo, - Reference info, - VersionVector& latestCommitVersions) { - latestCommitVersions.clear(); - - if (info->readOptions.present() && info->readOptions.get().debugID.present()) { - g_traceBatch.addEvent( - "TransactionDebug", info->readOptions.get().debugID.get().first(), "NativeAPI.getLatestCommitVersions"); - } - - if (!info->readVersionObtainedFromGrvProxy) { - return; - } - - if (ssVersionVectorCache.getMaxVersion() == invalidVersion) { - return; - } - - if (info->readVersion() > ssVersionVectorCache.getMaxVersion()) { - if (!CLIENT_KNOBS->FORCE_GRV_CACHE_OFF && !info->options.skipGrvCache && info->options.useGrvCache) { - return; - } else { - TraceEvent(SevError, "GetLatestCommitVersions") - .detail("ReadVersion", info->readVersion()) - .detail("VersionVector", ssVersionVectorCache.toString()); - ASSERT(false); - } - } - - std::map> versionMap; // order the versions to be returned - for (int i = 0; i < locationInfo->locations()->size(); i++) { - Tag tag = invalidTag; - Version commitVersion = invalidVersion; // latest commit version - getLatestCommitVersionForSSID(locationInfo->locations()->getId(i), tag, commitVersion); - - bool updatedVersionMap = false; - if (tag != invalidTag && commitVersion != invalidVersion && commitVersion < info->readVersion()) { - updatedVersionMap = true; - versionMap[commitVersion].insert(tag); - } - - // Do not log if commitVersion >= readVersion. - if (!updatedVersionMap && commitVersion == invalidVersion) { - TraceEvent(SevDebug, "CommitVersionNotFoundForSS") - .detail("InSSIDMap", tag != invalidTag ? 
1 : 0) - .detail("Tag", tag) - .detail("CommitVersion", commitVersion) - .detail("ReadVersion", info->readVersion()) - .detail("VersionVector", ssVersionVectorCache.toString()) - .setMaxEventLength(11000) - .setMaxFieldLength(10000); - ++transactionCommitVersionNotFoundForSS; - } - } - - // insert the commit versions in the version vector. - for (auto& iter : versionMap) { - latestCommitVersions.setVersion(iter.second, iter.first); - } -} - -void updateCachedReadVersionShared(double t, Version v, DatabaseSharedState* p) { - MutexHolder mutex(p->mutexLock); - if (v >= p->grvCacheSpace.cachedReadVersion) { - //TraceEvent(SevDebug, "CacheReadVersionUpdate") - // .detail("Version", v) - // .detail("CurTime", t) - // .detail("LastVersion", p->grvCacheSpace.cachedReadVersion) - // .detail("LastTime", p->grvCacheSpace.lastGrvTime); - p->grvCacheSpace.cachedReadVersion = v; - if (t > p->grvCacheSpace.lastGrvTime) { - p->grvCacheSpace.lastGrvTime = t; - } - } -} - -void DatabaseContext::updateCachedReadVersion(double t, Version v) { - if (sharedStatePtr) { - return updateCachedReadVersionShared(t, v, sharedStatePtr); - } - if (v >= cachedReadVersion) { - //TraceEvent(SevDebug, "CachedReadVersionUpdate") - // .detail("Version", v) - // .detail("GrvStartTime", t) - // .detail("LastVersion", cachedReadVersion) - // .detail("LastTime", lastGrvTime); - cachedReadVersion = v; - // Since the time is based on the start of the request, it's possible that we - // get a newer version with an older time. - // (Request started earlier, but was latest to reach the proxy) - // Only update time when strictly increasing (?) 
- if (t > lastGrvTime) { - lastGrvTime = t; - } - } -} - -Version DatabaseContext::getCachedReadVersion() { - if (sharedStatePtr) { - MutexHolder mutex(sharedStatePtr->mutexLock); - return sharedStatePtr->grvCacheSpace.cachedReadVersion; - } - return cachedReadVersion; -} - -double DatabaseContext::getLastGrvTime() { - if (sharedStatePtr) { - MutexHolder mutex(sharedStatePtr->mutexLock); - return sharedStatePtr->grvCacheSpace.lastGrvTime; - } - return lastGrvTime; -} - -Reference StorageServerInfo::getInterface(DatabaseContext* cx, - StorageServerInterface const& ssi, - LocalityData const& locality) { - auto it = cx->server_interf.find(ssi.id()); - if (it != cx->server_interf.end()) { - if (it->second->interf.getValue.getEndpoint().token != ssi.getValue.getEndpoint().token) { - if (it->second->interf.locality == ssi.locality) { - // FIXME: load balance holds pointers to individual members of the interface, and this assignment will - // swap out the object they are - // pointing to. This is technically correct, but is very unnatural. We may want to refactor load - // balance to take an AsyncVar> so that it is notified when the interface - // changes. - - it->second->interf = ssi; - } else { - it->second->notifyContextDestroyed(); - Reference loc(new StorageServerInfo(cx, ssi, locality)); - cx->server_interf[ssi.id()] = loc.getPtr(); - return loc; - } - } - - return Reference::addRef(it->second); - } - - Reference loc(new StorageServerInfo(cx, ssi, locality)); - cx->server_interf[ssi.id()] = loc.getPtr(); - return loc; -} - -void StorageServerInfo::notifyContextDestroyed() { - cx = nullptr; -} - -StorageServerInfo::~StorageServerInfo() { - if (cx) { - auto it = cx->server_interf.find(interf.id()); - if (it != cx->server_interf.end()) - cx->server_interf.erase(it); - cx = nullptr; - } -} - -void DatabaseContext::validateVersion(Version version) const { - // Version could be 0 if the INITIALIZE_NEW_DATABASE option is set. 
In that case, it is illegal to perform any - // reads. We throw client_invalid_operation because the caller didn't directly set the version, so the - // version_invalid error might be confusing. - if (version == 0) { - throw client_invalid_operation(); - } - if (switchable && version < minAcceptableReadVersion) { - CODE_PROBE(true, "Attempted to read a version lower than any this client has seen from the current cluster"); - throw transaction_too_old(); - } - - ASSERT(version > 0 || version == latestVersion); -} - -inline HealthMetrics populateHealthMetrics(const HealthMetrics& detailedMetrics, bool detailedOutput) { - if (detailedOutput) { - return detailedMetrics; - } else { - HealthMetrics result; - result.update(detailedMetrics, false, false); - return result; - } -} - -ACTOR static Future getHealthMetricsActor(DatabaseContext* cx, bool detailed, bool sendDetailedRequest) { - loop { - choose { - when(wait(cx->onProxiesChanged())) {} - when(GetHealthMetricsReply rep = wait(basicLoadBalance(cx->getGrvProxies(UseProvisionalProxies::False), - &GrvProxyInterface::getHealthMetrics, - GetHealthMetricsRequest(sendDetailedRequest)))) { - cx->healthMetrics.update(rep.healthMetrics, sendDetailedRequest, true); - cx->healthMetricsLastUpdated = now(); - if (sendDetailedRequest) { - cx->detailedHealthMetricsLastUpdated = now(); - } - return populateHealthMetrics(cx->healthMetrics, detailed); - } - } - } -} - -Future DatabaseContext::getHealthMetrics(bool detailed = false) { - if (now() - healthMetricsLastUpdated < CLIENT_KNOBS->AGGREGATE_HEALTH_METRICS_MAX_STALENESS) { - return populateHealthMetrics(healthMetrics, detailed); - } - bool sendDetailedRequest = - detailed && now() - detailedHealthMetricsLastUpdated > CLIENT_KNOBS->DETAILED_HEALTH_METRICS_MAX_STALENESS; - return getHealthMetricsActor(this, detailed, sendDetailedRequest); -} - -Future> DatabaseContext::getStorageStats(const UID& id, double maxStaleness) { - if (now() - detailedHealthMetricsLastUpdated < 
maxStaleness) { - auto it = healthMetrics.storageStats.find(id); - return it == healthMetrics.storageStats.end() ? Optional() : it->second; - } - - return map(getHealthMetricsActor(this, true, true), [&id](auto metrics) -> Optional { - auto it = metrics.storageStats.find(id); - return it == metrics.storageStats.end() ? Optional() : it->second; - }); -} - -// register a special key(s) implementation under the specified module -void DatabaseContext::registerSpecialKeysImpl(SpecialKeySpace::MODULE module, - SpecialKeySpace::IMPLTYPE type, - std::unique_ptr&& impl, - int deprecatedVersion) { - // if deprecated, add the implementation when the api version is less than the deprecated version - if (deprecatedVersion == -1 || apiVersion.version() < deprecatedVersion) { - specialKeySpace->registerKeyRange(module, type, impl->getKeyRange(), impl.get()); - specialKeySpaceModules.push_back(std::move(impl)); - } -} - -void traceTSSErrors(const char* name, UID tssId, const std::unordered_map& errorsByCode) { - TraceEvent ev(name, tssId); - for (auto& it : errorsByCode) { - ev.detail("E" + std::to_string(it.first), it.second); - } -} - -/* - For each request type, this will produce - Count - {SS,TSS}{Mean,P50,P90,P99} - Example: - GetValueLatencySSMean -*/ -void traceSSOrTSSPercentiles(TraceEvent& ev, const std::string name, DDSketch& sample) { - ev.detail(name + "Mean", sample.mean()); - // don't log the larger percentiles unless we actually have enough samples to log the accurate percentile instead of - // the largest sample in this window - if (sample.getPopulationSize() >= 3) { - ev.detail(name + "P50", sample.median()); - } - if (sample.getPopulationSize() >= 10) { - ev.detail(name + "P90", sample.percentile(0.90)); - } - if (sample.getPopulationSize() >= 100) { - ev.detail(name + "P99", sample.percentile(0.99)); - } -} - -void traceTSSPercentiles(TraceEvent& ev, - const std::string name, - DDSketch& ssSample, - DDSketch& tssSample) { - ASSERT(ssSample.getPopulationSize() == 
tssSample.getPopulationSize()); - ev.detail(name + "Count", ssSample.getPopulationSize()); - if (ssSample.getPopulationSize() > 0) { - traceSSOrTSSPercentiles(ev, name + "SS", ssSample); - traceSSOrTSSPercentiles(ev, name + "TSS", tssSample); - } -} - -ACTOR Future tssLogger(DatabaseContext* cx) { - state double lastLogged = 0; - loop { - wait(delay(CLIENT_KNOBS->TSS_METRICS_LOGGING_INTERVAL, TaskPriority::FlushTrace)); - - // Log each TSS pair separately - for (const auto& it : cx->tssMetrics) { - if (it.second->detailedMismatches.size()) { - cx->tssMismatchStream.send( - std::pair>(it.first, it.second->detailedMismatches)); - } - - // Do error histograms as separate event - if (it.second->ssErrorsByCode.size()) { - traceTSSErrors("TSS_SSErrors", it.first, it.second->ssErrorsByCode); - } - - if (it.second->tssErrorsByCode.size()) { - traceTSSErrors("TSS_TSSErrors", it.first, it.second->tssErrorsByCode); - } - - TraceEvent tssEv("TSSClientMetrics", cx->dbId); - tssEv.detail("TSSID", it.first) - .detail("Elapsed", (lastLogged == 0) ? 
0 : now() - lastLogged) - .detail("Internal", cx->internal); - - it.second->cc.logToTraceEvent(tssEv); - - traceTSSPercentiles(tssEv, "GetValueLatency", it.second->SSgetValueLatency, it.second->TSSgetValueLatency); - traceTSSPercentiles( - tssEv, "GetKeyValuesLatency", it.second->SSgetKeyValuesLatency, it.second->TSSgetKeyValuesLatency); - traceTSSPercentiles(tssEv, "GetKeyLatency", it.second->SSgetKeyLatency, it.second->TSSgetKeyLatency); - traceTSSPercentiles(tssEv, - "GetMappedKeyValuesLatency", - it.second->SSgetMappedKeyValuesLatency, - it.second->TSSgetMappedKeyValuesLatency); - - it.second->clear(); - } - - lastLogged = now(); - } -} - -ACTOR Future databaseLogger(DatabaseContext* cx) { - state double lastLogged = 0; - loop { - wait(delay(CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskPriority::FlushTrace)); - - bool logMetrics = !g_network->isSimulated() || BUGGIFY_WITH_PROB(0.01); - if (logMetrics) { - TraceEvent ev("TransactionMetrics", cx->dbId); - - ev.detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged) - .detail("Cluster", - cx->getConnectionRecord() - ? 
cx->getConnectionRecord()->getConnectionString().clusterKeyName().toString() - : "") - .detail("Internal", cx->internal); - - cx->cc.logToTraceEvent(ev); - - ev.detail("LocationCacheEntryCount", cx->locationCache.size()); - ev.detail("MeanLatency", cx->latencies.mean()) - .detail("MedianLatency", cx->latencies.median()) - .detail("Latency90", cx->latencies.percentile(0.90)) - .detail("Latency98", cx->latencies.percentile(0.98)) - .detail("MaxLatency", cx->latencies.max()) - .detail("MeanRowReadLatency", cx->readLatencies.mean()) - .detail("MedianRowReadLatency", cx->readLatencies.median()) - .detail("MaxRowReadLatency", cx->readLatencies.max()) - .detail("MeanGRVLatency", cx->GRVLatencies.mean()) - .detail("MedianGRVLatency", cx->GRVLatencies.median()) - .detail("MaxGRVLatency", cx->GRVLatencies.max()) - .detail("MeanCommitLatency", cx->commitLatencies.mean()) - .detail("MedianCommitLatency", cx->commitLatencies.median()) - .detail("MaxCommitLatency", cx->commitLatencies.max()) - .detail("MeanMutationsPerCommit", cx->mutationsPerCommit.mean()) - .detail("MedianMutationsPerCommit", cx->mutationsPerCommit.median()) - .detail("MaxMutationsPerCommit", cx->mutationsPerCommit.max()) - .detail("MeanBytesPerCommit", cx->bytesPerCommit.mean()) - .detail("MedianBytesPerCommit", cx->bytesPerCommit.median()) - .detail("MaxBytesPerCommit", cx->bytesPerCommit.max()) - .detail("NumLocalityCacheEntries", cx->locationCache.size()); - } - - cx->latencies.clear(); - cx->readLatencies.clear(); - cx->GRVLatencies.clear(); - cx->commitLatencies.clear(); - cx->mutationsPerCommit.clear(); - cx->bytesPerCommit.clear(); - - lastLogged = now(); - } -} - -struct TrInfoChunk { - ValueRef value; - Key key; -}; - -static const Key CLIENT_LATENCY_INFO_PREFIX = "client_latency/"_sr; -static const Key CLIENT_LATENCY_INFO_CTR_PREFIX = "client_latency_counter/"_sr; - -ACTOR static Future transactionInfoCommitActor(Transaction* tr, std::vector* chunks) { - state const Key clientLatencyAtomicCtr = 
CLIENT_LATENCY_INFO_CTR_PREFIX.withPrefix(fdbClientInfoPrefixRange.begin); - state int retryCount = 0; - loop { - try { - tr->reset(); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - state Future> vstamp = tr->getVersionstamp(); - int64_t numCommitBytes = 0; - for (auto& chunk : *chunks) { - tr->atomicOp(chunk.key, chunk.value, MutationRef::SetVersionstampedKey); - numCommitBytes += chunk.key.size() + chunk.value.size() - - 4; // subtract number of bytes of key that denotes version stamp index - } - tr->atomicOp(clientLatencyAtomicCtr, StringRef((uint8_t*)&numCommitBytes, 8), MutationRef::AddValue); - wait(tr->commit()); - return Void(); - } catch (Error& e) { - retryCount++; - if (retryCount == 10) - throw; - wait(tr->onError(e)); - } - } -} - -ACTOR static Future delExcessClntTxnEntriesActor(Transaction* tr, int64_t clientTxInfoSizeLimit) { - state const Key clientLatencyName = CLIENT_LATENCY_INFO_PREFIX.withPrefix(fdbClientInfoPrefixRange.begin); - state const Key clientLatencyAtomicCtr = CLIENT_LATENCY_INFO_CTR_PREFIX.withPrefix(fdbClientInfoPrefixRange.begin); - TraceEvent(SevInfo, "DelExcessClntTxnEntriesCalled").log(); - - // If we don't limit it with retries, the DatabaseContext will never cleanup as Transaction - // object will be alive and hold reference to DatabaseContext. 
- state int retries = 0; - loop { - try { - tr->reset(); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - Optional ctrValue = wait(tr->get(KeyRef(clientLatencyAtomicCtr), Snapshot::True)); - if (!ctrValue.present()) { - TraceEvent(SevInfo, "NumClntTxnEntriesNotFound").log(); - return Void(); - } - state int64_t txInfoSize = 0; - ASSERT(ctrValue.get().size() == sizeof(int64_t)); - memcpy(&txInfoSize, ctrValue.get().begin(), ctrValue.get().size()); - if (txInfoSize < clientTxInfoSizeLimit) - return Void(); - int getRangeByteLimit = (txInfoSize - clientTxInfoSizeLimit) < CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT - ? (txInfoSize - clientTxInfoSizeLimit) - : CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT; - GetRangeLimits limit(GetRangeLimits::ROW_LIMIT_UNLIMITED, getRangeByteLimit); - RangeResult txEntries = - wait(tr->getRange(KeyRangeRef(clientLatencyName, strinc(clientLatencyName)), limit)); - state int64_t numBytesToDel = 0; - KeyRef endKey; - for (auto& kv : txEntries) { - endKey = kv.key; - numBytesToDel += kv.key.size() + kv.value.size(); - if (txInfoSize - numBytesToDel <= clientTxInfoSizeLimit) - break; - } - if (numBytesToDel) { - tr->clear(KeyRangeRef(txEntries[0].key, strinc(endKey))); - TraceEvent(SevInfo, "DeletingExcessCntTxnEntries").detail("BytesToBeDeleted", numBytesToDel); - int64_t bytesDel = -numBytesToDel; - - tr->atomicOp(clientLatencyAtomicCtr, StringRef((uint8_t*)&bytesDel, 8), MutationRef::AddValue); - wait(tr->commit()); - } - if (txInfoSize - numBytesToDel <= clientTxInfoSizeLimit) - return Void(); - } catch (Error& e) { - if (e.code() == error_code_actor_cancelled || retries++ >= 10) { - throw; - } - - wait(tr->onError(e)); - } - } -} - -// FIXME: explain what "client status" is -// The reason for getting a pointer to DatabaseContext instead of a reference counted object is because reference -// counting will increment reference count for DatabaseContext which holds the future of this actor. 
This creates a -// cyclic reference and hence this actor and Database object will not be destroyed at all. -ACTOR static Future clientStatusUpdateActor(DatabaseContext* cx) { - state const std::string clientLatencyName = - CLIENT_LATENCY_INFO_PREFIX.withPrefix(fdbClientInfoPrefixRange.begin).toString(); - state Transaction tr; - state std::vector commitQ; - state int txBytes = 0; - - loop { - // Make sure we are connected to the server. Otherwise we may just try to keep reconnecting - // with incompatible clusters. - wait(cx->onConnected()); - - // Need to make sure that we eventually destroy tr. We can't rely on getting cancelled to do - // this because of the cyclic reference to self. - wait(refreshTransaction(cx, &tr)); - try { - ASSERT(cx->clientStatusUpdater.outStatusQ.empty()); - cx->clientStatusUpdater.inStatusQ.swap(cx->clientStatusUpdater.outStatusQ); - // Split Transaction Info into chunks - state std::vector trChunksQ; - for (auto& entry : cx->clientStatusUpdater.outStatusQ) { - auto& bw = entry.second; - int64_t value_size_limit = BUGGIFY - ? deterministicRandom()->randomInt(1e3, CLIENT_KNOBS->VALUE_SIZE_LIMIT) - : CLIENT_KNOBS->VALUE_SIZE_LIMIT; - int num_chunks = (bw.getLength() + value_size_limit - 1) / value_size_limit; - std::string random_id = deterministicRandom()->randomAlphaNumeric(16); - std::string user_provided_id = entry.first.size() ? 
entry.first + "/" : ""; - for (int i = 0; i < num_chunks; i++) { - TrInfoChunk chunk; - BinaryWriter chunkBW(Unversioned()); - chunkBW << bigEndian32(i + 1) << bigEndian32(num_chunks); - chunk.key = KeyRef(clientLatencyName + std::string(10, '\x00') + "/" + random_id + "/" + - chunkBW.toValue().toString() + "/" + user_provided_id + std::string(4, '\x00')); - int32_t pos = littleEndian32(clientLatencyName.size()); - memcpy(mutateString(chunk.key) + chunk.key.size() - sizeof(int32_t), &pos, sizeof(int32_t)); - if (i == num_chunks - 1) { - chunk.value = ValueRef(static_cast(bw.getData()) + (i * value_size_limit), - bw.getLength() - (i * value_size_limit)); - } else { - chunk.value = - ValueRef(static_cast(bw.getData()) + (i * value_size_limit), value_size_limit); - } - trChunksQ.push_back(std::move(chunk)); - } - } - - // Commit the chunks splitting into different transactions if needed - state int64_t dataSizeLimit = - BUGGIFY ? deterministicRandom()->randomInt(200e3, 1.5 * CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT) - : 0.8 * CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT; - state std::vector::iterator tracking_iter = trChunksQ.begin(); - ASSERT(commitQ.empty() && (txBytes == 0)); - loop { - state std::vector::iterator iter = tracking_iter; - txBytes = 0; - commitQ.clear(); - try { - while (iter != trChunksQ.end()) { - if (iter->value.size() + iter->key.size() + txBytes > dataSizeLimit) { - wait(transactionInfoCommitActor(&tr, &commitQ)); - tracking_iter = iter; - commitQ.clear(); - txBytes = 0; - } - commitQ.push_back(*iter); - txBytes += iter->value.size() + iter->key.size(); - ++iter; - } - if (!commitQ.empty()) { - wait(transactionInfoCommitActor(&tr, &commitQ)); - commitQ.clear(); - txBytes = 0; - } - break; - } catch (Error& e) { - if (e.code() == error_code_transaction_too_large) { - dataSizeLimit /= 2; - ASSERT(dataSizeLimit >= CLIENT_KNOBS->VALUE_SIZE_LIMIT + CLIENT_KNOBS->KEY_SIZE_LIMIT); - } else { - TraceEvent(SevWarnAlways, 
"ClientTrInfoErrorCommit").error(e).detail("TxBytes", txBytes); - commitQ.clear(); - txBytes = 0; - throw; - } - } - } - cx->clientStatusUpdater.outStatusQ.clear(); - wait(cx->globalConfig->onInitialized()); - double sampleRate = - cx->globalConfig->get(fdbClientInfoTxnSampleRate, std::numeric_limits::infinity()); - double clientSamplingProbability = - std::isinf(sampleRate) ? CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY : sampleRate; - int64_t sizeLimit = cx->globalConfig->get(fdbClientInfoTxnSizeLimit, -1); - int64_t clientTxnInfoSizeLimit = sizeLimit == -1 ? CLIENT_KNOBS->CSI_SIZE_LIMIT : sizeLimit; - if (!trChunksQ.empty() && deterministicRandom()->random01() < clientSamplingProbability) - wait(delExcessClntTxnEntriesActor(&tr, clientTxnInfoSizeLimit)); - - // Cleanup Transaction sooner than later, so that we don't hold reference to context. - tr = Transaction(); - wait(delay(CLIENT_KNOBS->CSI_STATUS_DELAY)); - } catch (Error& e) { - TraceEvent(SevWarnAlways, "UnableToWriteClientStatus").error(e); - if (e.code() == error_code_actor_cancelled) { - throw; - } - cx->clientStatusUpdater.outStatusQ.clear(); - - // Cleanup Transaction sooner than later, so that we don't hold reference to context. - tr = Transaction(); - wait(delay(10.0)); - } - } -} - -ACTOR Future assertFailure(GrvProxyInterface remote, Future> reply) { - try { - ErrorOr res = wait(reply); - if (!res.isError()) { - TraceEvent(SevError, "GotStaleReadVersion") - .detail("Remote", remote.getConsistentReadVersion.getEndpoint().addresses.address.toString()) - .detail("Provisional", remote.provisional) - .detail("ReadVersion", res.get().version); - ASSERT_WE_THINK(false); - } - } catch (Error& e) { - if (e.code() == error_code_actor_cancelled) { - throw; - } - // we want this to fail -- so getting here is good, we'll just ignore the error. 
- } - return Void(); -} - -Future attemptGRVFromOldProxies(std::vector oldProxies, - std::vector newProxies) { - auto debugID = nondeterministicRandom()->randomUniqueID(); - g_traceBatch.addEvent("AttemptGRVFromOldProxyDebug", debugID.first(), "NativeAPI.attemptGRVFromOldProxies.Start"); - Span span("NAPI:VerifyCausalReadRisky"_loc); - std::vector> replies; - replies.reserve(oldProxies.size()); - GetReadVersionRequest req( - span.context, 1, TransactionPriority::IMMEDIATE, GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY); - TraceEvent evt("AttemptGRVFromOldProxies"); - evt.detail("NumOldProxies", oldProxies.size()).detail("NumNewProxies", newProxies.size()); - auto traceProxies = [&](std::vector& proxies, std::string const& key) { - for (int i = 0; i < proxies.size(); ++i) { - auto k = key + std::to_string(i); - evt.detail(k.c_str(), proxies[i].id()); - } - }; - traceProxies(oldProxies, "OldProxy"s); - traceProxies(newProxies, "NewProxy"s); - evt.log(); - for (auto& i : oldProxies) { - req.reply = ReplyPromise(); - replies.push_back(assertFailure(i, i.getConsistentReadVersion.tryGetReply(req))); - } - return waitForAll(replies); -} - -ACTOR static Future monitorClientDBInfoChange(DatabaseContext* cx, - Reference const> clientDBInfo, - AsyncTrigger* proxiesChangeTrigger) { - state std::vector curCommitProxies; - state std::vector curGrvProxies; - state ActorCollection actors(false); - state Future clientDBInfoOnChange = clientDBInfo->onChange(); - curCommitProxies = clientDBInfo->get().commitProxies; - curGrvProxies = clientDBInfo->get().grvProxies; - - loop { - choose { - when(wait(clientDBInfoOnChange)) { - clientDBInfoOnChange = clientDBInfo->onChange(); - if (clientDBInfo->get().commitProxies != curCommitProxies || - clientDBInfo->get().grvProxies != curGrvProxies) { - // This condition is a bit complicated. Here we want to verify that we're unable to receive a read - // version from a proxy of an old generation after a successful recovery. 
The conditions are: - // 1. We only do this with a configured probability. - // 2. If the old set of Grv proxies is empty, there's nothing to do - // 3. If the new set of Grv proxies is empty, it means the recovery is not complete. So if an old - // Grv proxy still gives out read versions, this would be correct behavior. - // 4. If we see a provisional proxy, it means the recovery didn't complete yet, so the same as (3) - // applies. - if (deterministicRandom()->random01() < cx->verifyCausalReadsProp && !curGrvProxies.empty() && - !clientDBInfo->get().grvProxies.empty() && !clientDBInfo->get().grvProxies[0].provisional) { - actors.add(attemptGRVFromOldProxies(curGrvProxies, clientDBInfo->get().grvProxies)); - } - curCommitProxies = clientDBInfo->get().commitProxies; - curGrvProxies = clientDBInfo->get().grvProxies; - // Commits in the previous epoch may have been recovered but not included in the version vector. - // Clear the version vector to ensure the latest commit versions are received. 
- cx->ssVersionVectorCache.clear(); - proxiesChangeTrigger->trigger(); - } - } - when(wait(actors.getResult())) { - UNSTOPPABLE_ASSERT(false); - } - } - } -} - -void updateLocationCacheWithCaches(DatabaseContext* self, - const std::map& removed, - const std::map& added) { - // TODO: this needs to be more clever in the future - auto ranges = self->locationCache.ranges(); - for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) { - if (iter->value() && iter->value()->hasCaches) { - auto& val = iter->value(); - std::vector>> interfaces; - interfaces.reserve(val->size() - removed.size() + added.size()); - for (int i = 0; i < val->size(); ++i) { - const auto& interf = (*val)[i]; - if (removed.count(interf->interf.id()) == 0) { - interfaces.emplace_back(interf); - } - } - for (const auto& p : added) { - interfaces.push_back(makeReference>(p.second)); - } - iter->value() = makeReference(interfaces, true); - } - } -} - -ACTOR static Future cleanupLocationCache(DatabaseContext* cx) { - // Only run cleanup if TTL is enabled - if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL == 0.0) { - return Void(); - } - - loop { - wait(delay(CLIENT_KNOBS->LOCATION_CACHE_EVICTION_INTERVAL)); - - double currentTime = now(); - std::vector toRemove; - int totalCount = 0; - - // Scan locationCache for expired entries - auto iter = cx->locationCache.randomRange(); - for (; iter != cx->locationCache.lastItem(); ++iter) { - if (iter->value() && iter->value()->hasCaches) { - // Check the expireTime of the first cache entry as a representative - // All entries in a range typically have similar expiration times - if (iter->value()->expireTime > 0.0 && iter->value()->expireTime <= currentTime) { - toRemove.push_back(iter->range()); - } - } - totalCount++; - if (totalCount > 1000 || toRemove.size() > 100) { - break; // Avoid long blocking scans - } - } - - // Remove expired entries - for (const auto& range : toRemove) { - cx->locationCache.insert(range, Reference()); - } - - if (!toRemove.empty()) { 
- CODE_PROBE(true, "LocationCacheCleanup removed some entries"); - TraceEvent("LocationCacheCleanup").detail("RemovedRanges", toRemove.size()); - } - } -} - -ACTOR static Future handleTssMismatches(DatabaseContext* cx) { - state Reference tr; - state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); - state KeyBackedMap tssMismatchDB = KeyBackedMap(tssMismatchKeys.begin); - loop { - // return to calling actor, cx might be destroyed already with the tr reset below. - // This gives ~DatabaseContext a chance to cancel this actor. - wait(delay(0)); - - // - state std::pair> data = waitNext(cx->tssMismatchStream.getFuture()); - // find ss pair id so we can remove it from the mapping - state UID tssPairID; - bool found = false; - for (const auto& it : cx->tssMapping) { - if (it.second.id() == data.first) { - tssPairID = it.first; - found = true; - break; - } - } - if (found) { - state bool quarantine = CLIENT_KNOBS->QUARANTINE_TSS_ON_MISMATCH; - TraceEvent(SevWarnAlways, quarantine ? 
"TSS_QuarantineMismatch" : "TSS_KillMismatch") - .detail("TSSID", data.first.toString()); - CODE_PROBE(quarantine, "Quarantining TSS because it got mismatch"); - CODE_PROBE(!quarantine, "Killing TSS because it got mismatch"); - - tr = makeReference(Database(Reference::addRef(cx))); - state int tries = 0; - loop { - try { - tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - if (quarantine) { - tr->set(tssQuarantineKeyFor(data.first), ""_sr); - } else { - tr->clear(serverTagKeyFor(data.first)); - } - tssMapDB.erase(tr, tssPairID); - - for (const DetailedTSSMismatch& d : data.second) { - // -> mismatch data - tssMismatchDB.set(tr, - Tuple::makeTuple(data.first.toString(), d.timestamp, d.mismatchId.toString()), - d.traceString); - } - - wait(tr->commit()); - - break; - } catch (Error& e) { - wait(tr->onError(e)); - } - tries++; - if (tries > 10) { - // Give up, it'll get another mismatch or a human will investigate eventually - TraceEvent("TSS_MismatchGaveUp").detail("TSSID", data.first.toString()); - break; - } - } - // clear out txn so that the extra DatabaseContext ref gets decref'd and we can free cx - tr = makeReference(); - } else { - CODE_PROBE(true, "Not handling TSS with mismatch because it's already gone"); - } - } -} - -ACTOR Future> getJSON(Database db, std::string jsonField = ""); - -struct SingleSpecialKeyImpl : SpecialKeyRangeReadImpl { - Future getRange(ReadYourWritesTransaction* ryw, - KeyRangeRef kr, - GetRangeLimits limitsHint) const override { - ASSERT(kr.contains(k)); - return map(f(ryw), [k = k](Optional v) { - RangeResult result; - if (v.present()) { - result.push_back_deep(result.arena(), KeyValueRef(k, v.get())); - } - return result; - }); - } - - SingleSpecialKeyImpl(KeyRef k, - const std::function>(ReadYourWritesTransaction*)>& f, - bool supportsTenants = false) - : SpecialKeyRangeReadImpl(singleKeyRange(k)), k(k), f(f), tenantSupport(supportsTenants) {} - - bool 
supportsTenants() const override { - CODE_PROBE(tenantSupport, "Single special key in tenant"); - return tenantSupport; - }; - -private: - Key k; - std::function>(ReadYourWritesTransaction*)> f; - bool tenantSupport; -}; - -class HealthMetricsRangeImpl : public SpecialKeyRangeAsyncImpl { -public: - explicit HealthMetricsRangeImpl(KeyRangeRef kr); - Future getRange(ReadYourWritesTransaction* ryw, - KeyRangeRef kr, - GetRangeLimits limitsHint) const override; -}; - -static RangeResult healthMetricsToKVPairs(const HealthMetrics& metrics, KeyRangeRef kr) { - RangeResult result; - if (CLIENT_BUGGIFY) - return result; - if (kr.contains("\xff\xff/metrics/health/aggregate"_sr) && metrics.worstStorageDurabilityLag != 0) { - json_spirit::mObject statsObj; - statsObj["batch_limited"] = metrics.batchLimited; - statsObj["tps_limit"] = metrics.tpsLimit; - statsObj["worst_storage_durability_lag"] = metrics.worstStorageDurabilityLag; - statsObj["limiting_storage_durability_lag"] = metrics.limitingStorageDurabilityLag; - statsObj["worst_storage_queue"] = metrics.worstStorageQueue; - statsObj["limiting_storage_queue"] = metrics.limitingStorageQueue; - statsObj["worst_log_queue"] = metrics.worstTLogQueue; - std::string statsString = - json_spirit::write_string(json_spirit::mValue(statsObj), json_spirit::Output_options::raw_utf8); - ValueRef bytes(result.arena(), statsString); - result.push_back(result.arena(), KeyValueRef("\xff\xff/metrics/health/aggregate"_sr, bytes)); - } - // tlog stats - { - int phase = 0; // Avoid comparing twice per loop iteration - for (const auto& [uid, logStats] : metrics.tLogQueue) { - StringRef k{ StringRef(uid.toString()).withPrefix("\xff\xff/metrics/health/log/"_sr, result.arena()) }; - if (phase == 0 && k >= kr.begin) { - phase = 1; - } - if (phase == 1) { - if (k < kr.end) { - json_spirit::mObject statsObj; - statsObj["log_queue"] = logStats; - std::string statsString = - json_spirit::write_string(json_spirit::mValue(statsObj), 
json_spirit::Output_options::raw_utf8); - ValueRef bytes(result.arena(), statsString); - result.push_back(result.arena(), KeyValueRef(k, bytes)); - } else { - break; - } - } - } - } - // Storage stats - { - int phase = 0; // Avoid comparing twice per loop iteration - for (const auto& [uid, storageStats] : metrics.storageStats) { - StringRef k{ StringRef(uid.toString()).withPrefix("\xff\xff/metrics/health/storage/"_sr, result.arena()) }; - if (phase == 0 && k >= kr.begin) { - phase = 1; - } - if (phase == 1) { - if (k < kr.end) { - json_spirit::mObject statsObj; - statsObj["storage_durability_lag"] = storageStats.storageDurabilityLag; - statsObj["storage_queue"] = storageStats.storageQueue; - statsObj["cpu_usage"] = storageStats.cpuUsage; - statsObj["disk_usage"] = storageStats.diskUsage; - std::string statsString = - json_spirit::write_string(json_spirit::mValue(statsObj), json_spirit::Output_options::raw_utf8); - ValueRef bytes(result.arena(), statsString); - result.push_back(result.arena(), KeyValueRef(k, bytes)); - } else { - break; - } - } - } - } - return result; -} - -ACTOR static Future healthMetricsGetRangeActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) { - HealthMetrics metrics = wait(ryw->getDatabase()->getHealthMetrics( - /*detailed ("per process")*/ kr.intersects( - KeyRangeRef("\xff\xff/metrics/health/storage/"_sr, "\xff\xff/metrics/health/storage0"_sr)) || - kr.intersects(KeyRangeRef("\xff\xff/metrics/health/log/"_sr, "\xff\xff/metrics/health/log0"_sr)))); - return healthMetricsToKVPairs(metrics, kr); -} - -HealthMetricsRangeImpl::HealthMetricsRangeImpl(KeyRangeRef kr) : SpecialKeyRangeAsyncImpl(kr) {} - -Future HealthMetricsRangeImpl::getRange(ReadYourWritesTransaction* ryw, - KeyRangeRef kr, - GetRangeLimits limitsHint) const { - return healthMetricsGetRangeActor(ryw, kr); -} - -ACTOR Future getClusterId(Database db) { - while (!db->clientInfo->get().clusterId.isValid()) { - wait(db->clientInfo->onChange()); - } - return 
db->clientInfo->get().clusterId; -} - -void DatabaseContext::initializeSpecialCounters() { - specialCounter(cc, "OutstandingWatches", [this] { return outstandingWatches; }); - specialCounter(cc, "WatchMapSize", [this] { return watchMap.size(); }); -} - -DatabaseContext::DatabaseContext(Reference>> connectionRecord, - Reference> clientInfo, - Reference> const> coordinator, - Future clientInfoMonitor, - TaskPriority taskID, - LocalityData const& clientLocality, - EnableLocalityLoadBalance enableLocalityLoadBalance, - LockAware lockAware, - IsInternal internal, - int _apiVersion, - IsSwitchable switchable, - Optional defaultTenant) - : dbId(deterministicRandom()->randomUniqueID()), lockAware(lockAware), switchable(switchable), - connectionRecord(connectionRecord), proxyProvisional(false), clientLocality(clientLocality), - enableLocalityLoadBalance(enableLocalityLoadBalance), defaultTenant(defaultTenant), internal(internal), - cc("TransactionMetrics", dbId.toString()), transactionReadVersions("ReadVersions", cc), - transactionReadVersionsThrottled("ReadVersionsThrottled", cc), - transactionReadVersionsCompleted("ReadVersionsCompleted", cc), - transactionReadVersionBatches("ReadVersionBatches", cc), - transactionBatchReadVersions("BatchPriorityReadVersions", cc), - transactionDefaultReadVersions("DefaultPriorityReadVersions", cc), - transactionImmediateReadVersions("ImmediatePriorityReadVersions", cc), - transactionBatchReadVersionsCompleted("BatchPriorityReadVersionsCompleted", cc), - transactionDefaultReadVersionsCompleted("DefaultPriorityReadVersionsCompleted", cc), - transactionImmediateReadVersionsCompleted("ImmediatePriorityReadVersionsCompleted", cc), - transactionLogicalReads("LogicalUncachedReads", cc), transactionPhysicalReads("PhysicalReadRequests", cc), - transactionPhysicalReadsCompleted("PhysicalReadRequestsCompleted", cc), - transactionGetKeyRequests("GetKeyRequests", cc), transactionGetValueRequests("GetValueRequests", cc), - 
transactionGetRangeRequests("GetRangeRequests", cc), - transactionGetMappedRangeRequests("GetMappedRangeRequests", cc), - transactionGetRangeStreamRequests("GetRangeStreamRequests", cc), transactionWatchRequests("WatchRequests", cc), - transactionGetAddressesForKeyRequests("GetAddressesForKeyRequests", cc), transactionBytesRead("BytesRead", cc), - transactionKeysRead("KeysRead", cc), transactionMetadataVersionReads("MetadataVersionReads", cc), - transactionCommittedMutations("CommittedMutations", cc), - transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionSetMutations("SetMutations", cc), - transactionClearMutations("ClearMutations", cc), transactionAtomicMutations("AtomicMutations", cc), - transactionsCommitStarted("CommitStarted", cc), transactionsCommitCompleted("CommitCompleted", cc), - transactionKeyServerLocationRequests("KeyServerLocationRequests", cc), - transactionKeyServerLocationRequestsCompleted("KeyServerLocationRequestsCompleted", cc), - transactionStatusRequests("StatusRequests", cc), transactionTenantLookupRequests("TenantLookupRequests", cc), - transactionTenantLookupRequestsCompleted("TenantLookupRequestsCompleted", cc), transactionsTooOld("TooOld", cc), - transactionsFutureVersions("FutureVersions", cc), transactionsNotCommitted("NotCommitted", cc), - transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc), - transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc), - transactionsLockRejected("LockRejected", cc), - transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc), - transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc), - transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), - - latencies(), readLatencies(), commitLatencies(), GRVLatencies(), mutationsPerCommit(), bytesPerCommit(), - outstandingWatches(0), sharedStatePtr(nullptr), 
lastGrvTime(0.0), cachedReadVersion(0), - lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0), lastProxyRequestTime(0.0), - transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor), - coordinator(coordinator), apiVersion(_apiVersion), mvCacheInsertLocation(0), healthMetricsLastUpdated(0), - detailedHealthMetricsLastUpdated(0), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), - specialKeySpace(std::make_unique(specialKeys.begin, specialKeys.end, /* test */ false)), - connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) { - - DisabledTraceEvent("DatabaseContextCreated", dbId).backtrace(); - - connected = (clientInfo->get().commitProxies.size() && clientInfo->get().grvProxies.size()) - ? Void() - : clientInfo->onChange(); - - metadataVersionCache.resize(CLIENT_KNOBS->METADATA_VERSION_CACHE_SIZE); - maxOutstandingWatches = CLIENT_KNOBS->DEFAULT_MAX_OUTSTANDING_WATCHES; - - snapshotRywEnabled = apiVersion.hasSnapshotRYW() ? 1 : 0; - - logger = databaseLogger(this) && tssLogger(this); - locationCacheSize = g_network->isSimulated() ? 
CLIENT_KNOBS->LOCATION_CACHE_EVICTION_SIZE_SIM - : CLIENT_KNOBS->LOCATION_CACHE_EVICTION_SIZE; - - getValueSubmitted.init("NativeAPI.GetValueSubmitted"_sr); - getValueCompleted.init("NativeAPI.GetValueCompleted"_sr); - - clientDBInfoMonitor = monitorClientDBInfoChange(this, clientInfo, &proxiesChangeTrigger); - tssMismatchHandler = handleTssMismatches(this); - locationCacheCleanup = cleanupLocationCache(this); - clientStatusUpdater.actor = clientStatusUpdateActor(this); - - smoothMidShardSize.reset(CLIENT_KNOBS->INIT_MID_SHARD_BYTES); - globalConfig = std::make_unique(this); - - if (apiVersion.version() >= 740) { - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::METRICS, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique( - singleKeyRange("fault_tolerance_metrics_json"_sr) - .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::METRICS).begin))); - } - - if (apiVersion.version() >= 700) { - registerSpecialKeysImpl(SpecialKeySpace::MODULE::ERRORMSG, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique( - SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ERRORMSG).begin, - [](ReadYourWritesTransaction* ryw) -> Future> { - if (ryw->getSpecialKeySpaceErrorMsg().present()) - return Optional(ryw->getSpecialKeySpaceErrorMsg().get()); - else - return Optional(); - }, - true)); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique( - KeyRangeRef("options/"_sr, "options0"_sr) - .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique(SpecialKeySpace::getManagementApiCommandRange("exclude"))); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique(SpecialKeySpace::getManagementApiCommandRange("failed"))); - 
registerSpecialKeysImpl(SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique( - SpecialKeySpace::getManagementApiCommandRange("excludedlocality"))); - registerSpecialKeysImpl(SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique( - SpecialKeySpace::getManagementApiCommandRange("failedlocality"))); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique( - KeyRangeRef("in_progress_exclusion/"_sr, "in_progress_exclusion0"_sr) - .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::CONFIGURATION, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique( - KeyRangeRef("process/class_type/"_sr, "process/class_type0"_sr) - .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin))); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::CONFIGURATION, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique( - KeyRangeRef("process/class_source/"_sr, "process/class_source0"_sr) - .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin))); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique( - singleKeyRange("db_locked"_sr) - .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique( - singleKeyRange("consistency_check_suspended"_sr) - .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::GLOBALCONFIG, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::GLOBALCONFIG))); - 
registerSpecialKeysImpl( - SpecialKeySpace::MODULE::TRACING, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::TRACING))); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::CONFIGURATION, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique( - KeyRangeRef("coordinators/"_sr, "coordinators0"_sr) - .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin))); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique( - singleKeyRange("auto_coordinators"_sr) - .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique( - singleKeyRange("min_required_commit_version"_sr) - .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique( - singleKeyRange("version_epoch"_sr) - .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique( - KeyRangeRef("profiling/"_sr, "profiling0"_sr) - .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)), - /* deprecated */ ApiVersion::withClientProfilingDeprecated().version()); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique( - KeyRangeRef("maintenance/"_sr, "maintenance0"_sr) - .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique( - 
KeyRangeRef("data_distribution/"_sr, "data_distribution0"_sr) - .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::ACTORLINEAGE, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ACTORLINEAGE))); - registerSpecialKeysImpl(SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique( - SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF))); - } - if (apiVersion.version() >= 630) { - registerSpecialKeysImpl(SpecialKeySpace::MODULE::TRANSACTION, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique(conflictingKeysRange)); - registerSpecialKeysImpl(SpecialKeySpace::MODULE::TRANSACTION, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique(readConflictRangeKeysRange)); - registerSpecialKeysImpl(SpecialKeySpace::MODULE::TRANSACTION, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique(writeConflictRangeKeysRange)); - registerSpecialKeysImpl(SpecialKeySpace::MODULE::METRICS, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique(ddStatsRange)); - registerSpecialKeysImpl(SpecialKeySpace::MODULE::METRICS, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique( - KeyRangeRef("\xff\xff/metrics/health/"_sr, "\xff\xff/metrics/health0"_sr))); - registerSpecialKeysImpl(SpecialKeySpace::MODULE::WORKERINTERFACE, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique( - KeyRangeRef("\xff\xff/worker_interfaces/"_sr, "\xff\xff/worker_interfaces0"_sr))); - registerSpecialKeysImpl(SpecialKeySpace::MODULE::STATUSJSON, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique( - "\xff\xff/status/json"_sr, - [](ReadYourWritesTransaction* ryw) -> Future> { - if (ryw->getDatabase().getPtr() && ryw->getDatabase()->getConnectionRecord()) { - ++ryw->getDatabase()->transactionStatusRequests; - return getJSON(ryw->getDatabase()); - } else { - return 
Optional(); - } - }, - true)); - registerSpecialKeysImpl(SpecialKeySpace::MODULE::CLUSTERFILEPATH, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique( - "\xff\xff/cluster_file_path"_sr, - [](ReadYourWritesTransaction* ryw) -> Future> { - try { - if (ryw->getDatabase().getPtr() && - ryw->getDatabase()->getConnectionRecord()) { - Optional output = - StringRef(ryw->getDatabase()->getConnectionRecord()->getLocation()); - return output; - } - } catch (Error& e) { - return e; - } - return Optional(); - }, - true)); - - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::CONNECTIONSTRING, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique( - "\xff\xff/connection_string"_sr, - [](ReadYourWritesTransaction* ryw) -> Future> { - try { - if (ryw->getDatabase().getPtr() && ryw->getDatabase()->getConnectionRecord()) { - Reference f = ryw->getDatabase()->getConnectionRecord(); - Optional output = StringRef(f->getConnectionString().toString()); - return output; - } - } catch (Error& e) { - return e; - } - return Optional(); - }, - true)); - registerSpecialKeysImpl(SpecialKeySpace::MODULE::CLUSTERID, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique( - "\xff\xff/cluster_id"_sr, - [](ReadYourWritesTransaction* ryw) -> Future> { - try { - if (ryw->getDatabase().getPtr()) { - return map(getClusterId(ryw->getDatabase()), [](UID id) { - return Optional(StringRef(id.toString())); - }); - } - } catch (Error& e) { - return e; - } - return Optional(); - }, - true)); - - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique(SpecialKeySpace::getManagementApiCommandRange("tenant"))); - } - throttleExpirer = recurring([this]() { expireThrottles(); }, CLIENT_KNOBS->TAG_THROTTLE_EXPIRATION_INTERVAL); - - if (BUGGIFY) { - DatabaseContext::debugUseTags = true; - } - - initializeSpecialCounters(); -} - -DatabaseContext::DatabaseContext(const Error& err) - : deferredError(err), internal(IsInternal::False), 
cc("TransactionMetrics"), - transactionReadVersions("ReadVersions", cc), transactionReadVersionsThrottled("ReadVersionsThrottled", cc), - transactionReadVersionsCompleted("ReadVersionsCompleted", cc), - transactionReadVersionBatches("ReadVersionBatches", cc), - transactionBatchReadVersions("BatchPriorityReadVersions", cc), - transactionDefaultReadVersions("DefaultPriorityReadVersions", cc), - transactionImmediateReadVersions("ImmediatePriorityReadVersions", cc), - transactionBatchReadVersionsCompleted("BatchPriorityReadVersionsCompleted", cc), - transactionDefaultReadVersionsCompleted("DefaultPriorityReadVersionsCompleted", cc), - transactionImmediateReadVersionsCompleted("ImmediatePriorityReadVersionsCompleted", cc), - transactionLogicalReads("LogicalUncachedReads", cc), transactionPhysicalReads("PhysicalReadRequests", cc), - transactionPhysicalReadsCompleted("PhysicalReadRequestsCompleted", cc), - transactionGetKeyRequests("GetKeyRequests", cc), transactionGetValueRequests("GetValueRequests", cc), - transactionGetRangeRequests("GetRangeRequests", cc), - transactionGetMappedRangeRequests("GetMappedRangeRequests", cc), - transactionGetRangeStreamRequests("GetRangeStreamRequests", cc), transactionWatchRequests("WatchRequests", cc), - transactionGetAddressesForKeyRequests("GetAddressesForKeyRequests", cc), transactionBytesRead("BytesRead", cc), - transactionKeysRead("KeysRead", cc), transactionMetadataVersionReads("MetadataVersionReads", cc), - transactionCommittedMutations("CommittedMutations", cc), - transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionSetMutations("SetMutations", cc), - transactionClearMutations("ClearMutations", cc), transactionAtomicMutations("AtomicMutations", cc), - transactionsCommitStarted("CommitStarted", cc), transactionsCommitCompleted("CommitCompleted", cc), - transactionKeyServerLocationRequests("KeyServerLocationRequests", cc), - transactionKeyServerLocationRequestsCompleted("KeyServerLocationRequestsCompleted", 
cc), - transactionStatusRequests("StatusRequests", cc), transactionTenantLookupRequests("TenantLookupRequests", cc), - transactionTenantLookupRequestsCompleted("TenantLookupRequestsCompleted", cc), transactionsTooOld("TooOld", cc), - transactionsFutureVersions("FutureVersions", cc), transactionsNotCommitted("NotCommitted", cc), - transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc), - transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc), - transactionsLockRejected("LockRejected", cc), - transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc), - transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc), - transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), latencies(), readLatencies(), - commitLatencies(), GRVLatencies(), mutationsPerCommit(), bytesPerCommit(), sharedStatePtr(nullptr), - transactionTracingSample(false), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), - connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())), outstandingWatches(0) { - initializeSpecialCounters(); -} - -// Static constructor used by server processes to create a DatabaseContext -// For internal (fdbserver) use only -Database DatabaseContext::create(Reference> clientInfo, - Future clientInfoMonitor, - LocalityData clientLocality, - EnableLocalityLoadBalance enableLocalityLoadBalance, - TaskPriority taskID, - LockAware lockAware, - int apiVersion, - IsSwitchable switchable) { - return Database(new DatabaseContext(Reference>>(), - clientInfo, - makeReference>>(), - clientInfoMonitor, - taskID, - clientLocality, - enableLocalityLoadBalance, - lockAware, - IsInternal::True, - apiVersion, - switchable)); -} - -DatabaseContext::~DatabaseContext() { - clientDBInfoMonitor.cancel(); - monitorTssInfoChange.cancel(); - tssMismatchHandler.cancel(); - storage = nullptr; - - 
if (grvUpdateHandler.isValid()) { - grvUpdateHandler.cancel(); - } - if (sharedStatePtr) { - sharedStatePtr->delRef(sharedStatePtr); - } - for (auto it = server_interf.begin(); it != server_interf.end(); it = server_interf.erase(it)) - it->second->notifyContextDestroyed(); - ASSERT_ABORT(server_interf.empty()); - locationCache.insert(allKeys, Reference()); - - DisabledTraceEvent("DatabaseContextDestructed", dbId).backtrace(); -} diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 08c002df922..1b62a1052e9 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -669,7 +669,6 @@ ACTOR Future databaseLogger(DatabaseContext* cx) { cx->cc.logToTraceEvent(ev); - ev.detail("LocationCacheEntryCount", cx->locationCache.size()); ev.detail("MeanLatency", cx->latencies.mean()) .detail("MedianLatency", cx->latencies.median()) .detail("Latency90", cx->latencies.percentile(0.90)) @@ -1077,6 +1076,93 @@ Reference addCaches(const Reference& loc, return makeReference(interfaces, true); } +ACTOR static Future cleanupLocationCache(DatabaseContext* cx) { + // Only run cleanup if TTL is enabled + if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL == 0.0) { + return Void(); + } + + TraceEvent("LocationCacheCleanup1") + .detail("LOCATION_CACHE_ENTRY_TTL", CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL) + .detail("LOCATION_CACHE_ENTRY_REFRESH_TIME", CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME) + .detail("LOCATION_CACHE_EVICTION_INTERVAL", CLIENT_KNOBS->LOCATION_CACHE_EVICTION_INTERVAL) + .detail("LOCATION_CACHE_MAX_ENTRIES_PER_ITERATION", CLIENT_KNOBS->LOCATION_CACHE_MAX_ENTRIES_PER_ITERATION) + .detail("LOCATION_CACHE_MAX_REMOVED_ENTRIES_PER_ITERATION", + CLIENT_KNOBS->LOCATION_CACHE_MAX_REMOVED_ENTRIES_PER_ITERATION); + + loop { + wait(delay(CLIENT_KNOBS->LOCATION_CACHE_EVICTION_INTERVAL)); + + double currentTime = now(); + std::vector toRemove; + int totalCount = 0; + + // Scan locationCache for expired entries + auto iter = 
cx->locationCache.randomRange(); + for (; iter != cx->locationCache.lastItem(); ++iter) { + if (iter->value()) { + // Check the expireTime of the first cache entry as a representative + // All entries in a range typically have similar expiration times + if (iter->value()->expireTime > 0.0 && iter->value()->expireTime <= currentTime) { + toRemove.push_back(iter->range()); + TraceEvent("LocationCacheCleanup3") + .detail("TotalCount", totalCount) + .detail("StorageServerInterfaceCacheSize", iter->value()->size()) + .detail("ExpireTime", iter->value()->expireTime) + .detail("CurrentTime", currentTime); + } + } + totalCount++; + // If LOCATION_CACHE_MAX_ENTRIES_PER_ITERATION is set to a negative number the limitation per iteration is + // removed, same for LOCATION_CACHE_MAX_REMOVED_ENTRIES_PER_ITERATION. + if ((CLIENT_KNOBS->LOCATION_CACHE_MAX_ENTRIES_PER_ITERATION >= 0 && + totalCount > CLIENT_KNOBS->LOCATION_CACHE_MAX_ENTRIES_PER_ITERATION) || + CLIENT_KNOBS->LOCATION_CACHE_MAX_REMOVED_ENTRIES_PER_ITERATION >= 0 || + toRemove.size() > CLIENT_KNOBS->LOCATION_CACHE_MAX_REMOVED_ENTRIES_PER_ITERATION) { + break; // Avoid long blocking scans + } + } + + // TODO (j-scheuermann): This approach is quite expensive and scans all cache locations. 
+ // auto ranges = cx->locationCache.ranges(); + // for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) { + // if (iter->value()) { + // // Check the expireTime of the first cache entry as a representative + // // All entries in a range typically have similar expiration times + // if (iter->value()->expireTime > 0.0 && iter->value()->expireTime <= currentTime) { + // toRemove.push_back(iter->range()); + // TraceEvent("LocationCacheCleanup2") + // .detail("Begin", iter.begin()) + // .detail("End", iter.end()) + // .detail("ExpireTime", iter->value()->expireTime) + // .detail("CurrentTime", currentTime); + // } + // } + + // totalCount++; + // } + + // Remove expired entries + for (const auto& range : toRemove) { + cx->locationCache.insert(range, Reference()); + } + + TraceEvent("LocationCacheCleanup4") + .detail("RemovedRanges", toRemove.size()) + .detail("TotalCount", totalCount) + .detail("CacheSize", cx->locationCache.size()) + .detail("Duration", now() - currentTime); + + if (!toRemove.empty()) { + CODE_PROBE(true, "LocationCacheCleanup removed some entries"); + TraceEvent("LocationCacheCleanup") + .detail("RemovedRanges", toRemove.size()) + .detail("CheckedEntries", totalCount) + .detail("CacheSize", cx->locationCache.size()); + } + } +} + ACTOR Future updateCachedRanges(DatabaseContext* self, std::map* cacheServers) { state Transaction tr; state Value trueValue = storageCacheValue(std::vector{ 0 }); @@ -1611,6 +1697,7 @@ DatabaseContext::DatabaseContext(ReferenceclearFailedEndpointOnHealthyServer(endpoint); } + + // TODO (j-scheuermann): Track if an endpoint and the server is failed. 
+ return false; } @@ -3122,9 +3214,16 @@ Future getKeyLocation(Database const& cx, bool onlyEndpointFailedAndNeedRefresh = false; for (int i = 0; i < locationInfo.get().locations->size(); i++) { - if (checkOnlyEndpointFailed(cx, locationInfo.get().locations->get(i, member).getEndpoint())) { + auto endpoint = locationInfo.get().locations->get(i, member).getEndpoint(); + // TODO (j-scheuermann) Update? How is a endpoint determined to be failed? + if (checkOnlyEndpointFailed(cx, endpoint)) { onlyEndpointFailedAndNeedRefresh = true; } + + TraceEvent("GetKeyLocation") + .detail("LocationSize", locationInfo.get().locations->size()) + .detail("PrimaryAddress", endpoint.getPrimaryAddress()) + .detail("Failed", onlyEndpointFailedAndNeedRefresh); } if (onlyEndpointFailedAndNeedRefresh) { @@ -3274,9 +3373,16 @@ Future> getKeyRangeLocations(Database const& c for (const auto& locationInfo : locations) { bool onlyEndpointFailedAndNeedRefresh = false; for (int i = 0; i < locationInfo.locations->size(); i++) { - if (checkOnlyEndpointFailed(cx, locationInfo.locations->get(i, member).getEndpoint())) { + auto endpoint = locationInfo.locations->get(i, member).getEndpoint(); + // TODO (j-scheuermann)L Update? How does the failure monitor detect a failed endpoint? + if (checkOnlyEndpointFailed(cx, endpoint)) { onlyEndpointFailedAndNeedRefresh = true; } + + TraceEvent("GetKeyRangeLocations") + .detail("LocationSize", locationInfo.locations->size()) + .detail("PrimaryAddress", endpoint.getPrimaryAddress()) + .detail("Failed", onlyEndpointFailedAndNeedRefresh); } if (onlyEndpointFailedAndNeedRefresh) { diff --git a/fdbclient/include/fdbclient/ClientKnobs.h b/fdbclient/include/fdbclient/ClientKnobs.h index ba0d5440e03..568ab593ad9 100644 --- a/fdbclient/include/fdbclient/ClientKnobs.h +++ b/fdbclient/include/fdbclient/ClientKnobs.h @@ -103,6 +103,12 @@ class ClientKnobs : public KnobsImpl { // How often to run the background actor that removes expired location cache entries. 
// Only has effect when LOCATION_CACHE_ENTRY_TTL > 0. Default 60 seconds. double LOCATION_CACHE_EVICTION_INTERVAL; + // The maximum entries per cache evition iteration to check if they are expired. + // If set to a negative number all entries will be validated. + double LOCATION_CACHE_MAX_ENTRIES_PER_ITERATION; + // The maximum entires per cache evition iteration to remove. + // If set to a negative number all expired cache entries will be removed. + double LOCATION_CACHE_MAX_REMOVED_ENTRIES_PER_ITERATION; int GET_RANGE_SHARD_LIMIT; int WARM_RANGE_SHARD_LIMIT; From c496f34cbc18a705834d2ad50b6131584167b2f3 Mon Sep 17 00:00:00 2001 From: "Johannes M. Scheuermann" Date: Fri, 28 Nov 2025 18:17:53 +0100 Subject: [PATCH 5/7] Implement location cache cleanup with iterative approach to check all entries --- fdbclient/ClientKnobs.cpp | 14 +- fdbclient/NativeAPI.actor.cpp | 181 +++++++----------- fdbclient/include/fdbclient/ClientKnobs.h | 11 -- fdbclient/include/fdbclient/DatabaseContext.h | 10 +- 4 files changed, 76 insertions(+), 140 deletions(-) diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index e5845406719..1046e2ee470 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -97,17 +97,11 @@ void ClientKnobs::initialize(Randomize randomize) { init( LOCATION_CACHE_EVICTION_SIZE_SIM, 10 ); if( randomize && BUGGIFY ) LOCATION_CACHE_EVICTION_SIZE_SIM = 3; init( LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD, 60 ); init( LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL, 60 ); - // TTL disabled by default to preserve existing behavior; set > 0 to enable - init( LOCATION_CACHE_ENTRY_TTL, 0.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_TTL = deterministicRandom()->randomInt(10, 60); - // When cache entry is used, extend its expiration by this amount (sliding window) - init( LOCATION_CACHE_ENTRY_REFRESH_TIME, 300.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_REFRESH_TIME = deterministicRandom()->randomInt(10, 60); - // Run 
location cache cleanup every 60 seconds when TTL is enabled - init( LOCATION_CACHE_EVICTION_INTERVAL, 60.0 ); + // The interval in seconds to run the cache eviction logic. If enabled, the client will iterate over the location cache entries and remove + // stale/failed entries. + init( LOCATION_CACHE_EVICTION_INTERVAL, 0.0 ); // The maximum entries per cache evition iteration to check if they are expired. If set to a negative number all entries will be validated. - init( LOCATION_CACHE_MAX_ENTRIES_PER_ITERATION, 1000.0 ); - // The maximum entires per cache evition iteration to remove. If set to a negative number all expired cache entries will be removed. - init( LOCATION_CACHE_MAX_REMOVED_ENTRIES_PER_ITERATION, 60.0 ); - + init( LOCATION_CACHE_MAX_ENTRIES_PER_ITERATION, 1000.0 ); init( GET_RANGE_SHARD_LIMIT, 2 ); init( WARM_RANGE_SHARD_LIMIT, 100 ); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 1b62a1052e9..c78f042426a 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1076,89 +1076,85 @@ Reference addCaches(const Reference& loc, return makeReference(interfaces, true); } +// cleanupLocationCache is an actor that periodically cleans up stale/failed entries in the client's location cache by +// removing entries that point to failed storage servers. The cleanup of the location cache is required to ensure that +// the client is not connecting to old/stale storage servers. ACTOR static Future cleanupLocationCache(DatabaseContext* cx) { - // Only run cleanup if TTL is enabled - if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL == 0.0) { + // Only if the LOCATION_CACHE_EVICTION_INTERVAL is set to a number greater than 0 do we have to perform the location + // cache validation.
+ if (CLIENT_KNOBS->LOCATION_CACHE_EVICTION_INTERVAL <= 0.0) { return Void(); } - TraceEvent("LocationCacheCleanup1") - .detail("LOCATION_CACHE_ENTRY_TTL", CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL) - .detail("LOCATION_CACHE_ENTRY_REFRESH_TIME", CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME) - .detail("LOCATION_CACHE_EVICTION_INTERVAL", CLIENT_KNOBS->LOCATION_CACHE_EVICTION_INTERVAL) - .detail("LOCATION_CACHE_MAX_ENTRIES_PER_ITERATION", CLIENT_KNOBS->LOCATION_CACHE_MAX_ENTRIES_PER_ITERATION) - .detail("LOCATION_CACHE_MAX_REMOVED_ENTRIES_PER_ITERATION", - CLIENT_KNOBS->LOCATION_CACHE_MAX_REMOVED_ENTRIES_PER_ITERATION); + // Track the current position by key to continue after we reached + // CLIENT_KNOBS->LOCATION_CACHE_MAX_ENTRIES_PER_ITERATION in an iteration. Storing the visited key helps to perform + // the validation on all keys in a circular manner. + state Key currentValidationPosition; + // Iterate over the location caches and check if any of the storage servers have failed. In case a storage + // server has failed, the location cache entry will be removed/invalidated. loop { wait(delay(CLIENT_KNOBS->LOCATION_CACHE_EVICTION_INTERVAL)); - double currentTime = now(); std::vector toRemove; - int totalCount = 0; + int checkedEntries = 0; + // Fetch the current ranges of the location cache. + auto ranges = cx->locationCache.ranges(); + // Find where we left off using KEY (not iterator). + auto iter = ranges.begin(); + if (currentValidationPosition.size() > 0) { + // Seek to last position + iter = cx->locationCache.rangeContaining(currentValidationPosition); + if (iter != ranges.end() && iter.range().begin == currentValidationPosition) { + ++iter; // Move past the last processed entry, since we already checked that key range + } + } + + for (; iter != ranges.end(); ++iter) { + // Avoid long blocking scans.
+ if (CLIENT_KNOBS->LOCATION_CACHE_MAX_ENTRIES_PER_ITERATION >= 0 && + checkedEntries >= CLIENT_KNOBS->LOCATION_CACHE_MAX_ENTRIES_PER_ITERATION) { + break; + } - // Scan locationCache for expired entries - auto iter = cx->locationCache.randomRange(); - for (; iter != cx->locationCache.lastItem(); ++iter) { if (iter->value()) { - // Check the expireTime of the first cache entry as a representative - // All entries in a range typically have similar expiration times - if (iter->value()->expireTime > 0.0 && iter->value()->expireTime <= currentTime) { - toRemove.push_back(iter->range()); - TraceEvent("LocationCacheCleanup3") - .detail("TotalCount", totalCount) - .detail("StorageServerInterfaceCacheSize", iter->value()->size()) - .detail("ExpireTime", iter->value()->expireTime) - .detail("CurrentTime", currentTime); - } - } - totalCount++; - // If LOCATION_CACHE_MAX_ENTRIES_PER_ITERATION is set to a negative number the limitation per iteration is - // removed, same for LOCATION_CACHE_MAX_REMOVED_ENTRIES_PER_ITERATION. - if ((CLIENT_KNOBS->LOCATION_CACHE_MAX_ENTRIES_PER_ITERATION >= 0 && - totalCount > CLIENT_KNOBS->LOCATION_CACHE_MAX_ENTRIES_PER_ITERATION) || - CLIENT_KNOBS->LOCATION_CACHE_MAX_REMOVED_ENTRIES_PER_ITERATION >= 0 || - toRemove.size() > CLIENT_KNOBS->LOCATION_CACHE_MAX_REMOVED_ENTRIES_PER_ITERATION) { - break; // Avoid long blocking scans - } - } - - // TODO (j-scheuermann): This approach is quite expensive and scans all cache locations. 
- // auto ranges = cx->locationCache.ranges(); - // for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) { - // if (iter->value()) { - // // Check the expireTime of the first cache entry as a representative - // // All entries in a range typically have similar expiration times - // if (iter->value()->expireTime > 0.0 && iter->value()->expireTime <= currentTime) { - // toRemove.push_back(iter->range()); - // TraceEvent("LocationCacheCleanup2") - // .detail("Begin", iter.begin()) - // .detail("End", iter.end()) - // .detail("ExpireTime", iter->value()->expireTime) - // .detail("CurrentTime", currentTime); - // } - // } - - // totalCount++; - // } + auto& locationInfo = iter->value(); + // Iterate over all storage interfaces for this location (key range) cache. + for (int i = 0; i < locationInfo->size(); ++i) { + const auto& interf = (*locationInfo)[i]; + // Check if the address is marked as failed in the FailureMonitor. If so remove this key range and + // stop iterating over the other storage interfaces. A single failed storage interface is enough to + // remove the cached entry. + if (IFailureMonitor::failureMonitor().getState(interf->interf.address()).isFailed()) { + toRemove.push_back(iter->range()); + break; + } + } + + // Update the current validated position (key) to the key that starts the current range. + currentValidationPosition = iter.range().begin; + } + + checkedEntries++; + } + + // If we completed a full scan we have to reset the validated position (key) to an empty key and + // start in the next iteration with the first range. 
+ if (iter == ranges.end()) { + currentValidationPosition = Key(); + } // Remove expired entries for (const auto& range : toRemove) { cx->locationCache.insert(range, Reference()); } - TraceEvent("LocationCacheCleanup4") - .detail("RemovedRanges", toRemove.size()) - .detail("TotalCount", totalCount) - .detail("CacheSize", cx->locationCache.size()) - .detail("Duration", now() - currentTime); - if (!toRemove.empty()) { CODE_PROBE(true, "LocationCacheCleanup removed some entries"); TraceEvent("LocationCacheCleanup") - .detail("RemovedRanges", toRemove.size()) - .detail("CheckedEntries", totalCount) - .detail("CacheSize", cx->locationCache.size()); + .detail("NumRemovedRanges", toRemove.size()) + .detail("NumCheckedEntries", checkedEntries) + .detail("NumLocalityCacheEntries", cx->locationCache.size()); } } } @@ -2041,14 +2037,8 @@ Optional DatabaseContext::getCachedLocation(const TenantIn auto range = isBackward ? locationCache.rangeContainingKeyBefore(resolvedKey) : locationCache.rangeContaining(resolvedKey); - auto& loc = range->value(); - if (loc) { - // Cache hit: extend expiration time if refresh knob is set - if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME > 0.0 && loc->expireTime > 0.0) { - loc->expireTime = now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME; - } - return KeyRangeLocationInfo(toPrefixRelativeRange(range->range(), tenant.prefix), loc); - } + if (range->value()) + return KeyRangeLocationInfo(toPrefixRelativeRange(range->range(), tenant.prefix), range->value()); return Optional(); } @@ -2077,10 +2067,6 @@ bool DatabaseContext::getCachedLocations(const TenantInfo& tenant, result.clear(); return false; } - // Cache hit: extend expiration time if refresh knob is set - if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME > 0.0 && r->value()->expireTime > 0.0) { - r->value()->expireTime = now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME; - } result.emplace_back(toPrefixRelativeRange(r->range() & resolvedRange, tenant.prefix), r->value()); 
if (result.size() == limit || begin == end) { break; @@ -3186,12 +3172,9 @@ bool checkOnlyEndpointFailed(const Database& cx, const Endpoint& endpoint) { return true; } } else { - // todo check if endpoint failed ! cx->clearFailedEndpointOnHealthyServer(endpoint); } - // TODO (j-scheuermann): Track if an endpoint and the server is failed. - return false; } @@ -3212,26 +3195,13 @@ Future getKeyLocation(Database const& cx, cx, tenant, key, spanContext, debugID, useProvisionalProxies, isBackward, version); } - bool onlyEndpointFailedAndNeedRefresh = false; for (int i = 0; i < locationInfo.get().locations->size(); i++) { - auto endpoint = locationInfo.get().locations->get(i, member).getEndpoint(); - // TODO (j-scheuermann) Update? How is a endpoint determined to be failed? - if (checkOnlyEndpointFailed(cx, endpoint)) { - onlyEndpointFailedAndNeedRefresh = true; + if (checkOnlyEndpointFailed(cx, locationInfo.get().locations->get(i, member).getEndpoint())) { + cx->invalidateCache(tenant.prefix, key); + // Refresh the cache with a new getKeyLocations made to proxies. + return getKeyLocation_internal( + cx, tenant, key, spanContext, debugID, useProvisionalProxies, isBackward, version); } - - TraceEvent("GetKeyLocation") - .detail("LocationSize", locationInfo.get().locations->size()) - .detail("PrimaryAddress", endpoint.getPrimaryAddress()) - .detail("Failed", onlyEndpointFailedAndNeedRefresh); - } - - if (onlyEndpointFailedAndNeedRefresh) { - cx->invalidateCache(tenant.prefix, key); - - // Refresh the cache with a new getKeyLocations made to proxies. 
- return getKeyLocation_internal( - cx, tenant, key, spanContext, debugID, useProvisionalProxies, isBackward, version); } return locationInfo.get(); @@ -3371,23 +3341,12 @@ Future> getKeyRangeLocations(Database const& c bool foundFailed = false; for (const auto& locationInfo : locations) { - bool onlyEndpointFailedAndNeedRefresh = false; for (int i = 0; i < locationInfo.locations->size(); i++) { - auto endpoint = locationInfo.locations->get(i, member).getEndpoint(); - // TODO (j-scheuermann)L Update? How does the failure monitor detect a failed endpoint? - if (checkOnlyEndpointFailed(cx, endpoint)) { - onlyEndpointFailedAndNeedRefresh = true; + if (checkOnlyEndpointFailed(cx, locationInfo.locations->get(i, member).getEndpoint())) { + cx->invalidateCache(tenant.prefix, locationInfo.range.begin); + foundFailed = true; + break; } - - TraceEvent("GetKeyRangeLocations") - .detail("LocationSize", locationInfo.locations->size()) - .detail("PrimaryAddress", endpoint.getPrimaryAddress()) - .detail("Failed", onlyEndpointFailedAndNeedRefresh); - } - - if (onlyEndpointFailedAndNeedRefresh) { - cx->invalidateCache(tenant.prefix, locationInfo.range.begin); - foundFailed = true; } } diff --git a/fdbclient/include/fdbclient/ClientKnobs.h b/fdbclient/include/fdbclient/ClientKnobs.h index 568ab593ad9..3c9a2c1c8b5 100644 --- a/fdbclient/include/fdbclient/ClientKnobs.h +++ b/fdbclient/include/fdbclient/ClientKnobs.h @@ -94,21 +94,10 @@ class ClientKnobs : public KnobsImpl { int LOCATION_CACHE_EVICTION_SIZE_SIM; double LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD; double LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL; - // If > 0, each key-location cache entry expires this many seconds after insertion. - // Default 0 disables TTL expiration and keeps current behavior. - double LOCATION_CACHE_ENTRY_TTL; - // If > 0, extend the expireTime by this many seconds when a cached entry is used (cache hit). - // Only has effect when LOCATION_CACHE_ENTRY_TTL > 0. 
- double LOCATION_CACHE_ENTRY_REFRESH_TIME; - // How often to run the background actor that removes expired location cache entries. - // Only has effect when LOCATION_CACHE_ENTRY_TTL > 0. Default 60 seconds. double LOCATION_CACHE_EVICTION_INTERVAL; // The maximum entries per cache evition iteration to check if they are expired. // If set to a negative number all entries will be validated. double LOCATION_CACHE_MAX_ENTRIES_PER_ITERATION; - // The maximum entires per cache evition iteration to remove. - // If set to a negative number all expired cache entries will be removed. - double LOCATION_CACHE_MAX_REMOVED_ENTRIES_PER_ITERATION; int GET_RANGE_SHARD_LIMIT; int WARM_RANGE_SHARD_LIMIT; diff --git a/fdbclient/include/fdbclient/DatabaseContext.h b/fdbclient/include/fdbclient/DatabaseContext.h index dd0ebfbca76..9b73e71cf97 100644 --- a/fdbclient/include/fdbclient/DatabaseContext.h +++ b/fdbclient/include/fdbclient/DatabaseContext.h @@ -64,13 +64,9 @@ class StorageServerInfo : public ReferencedInterface { struct LocationInfo : MultiInterface>, FastAllocated { using Locations = MultiInterface>; explicit LocationInfo(const std::vector>>& v) - : Locations(v), - expireTime(CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL > 0.0 ? now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL - : 0.0) {} + : Locations(v) {} LocationInfo(const std::vector>>& v, bool hasCaches) - : Locations(v), hasCaches(hasCaches), - expireTime(CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL > 0.0 ? now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL - : 0.0) {} + : Locations(v), hasCaches(hasCaches) {} LocationInfo(const LocationInfo&) = delete; LocationInfo(LocationInfo&&) = delete; LocationInfo& operator=(const LocationInfo&) = delete; @@ -78,8 +74,6 @@ struct LocationInfo : MultiInterface Reference locations() { return Reference::addRef(this); } bool hasCaches = false; - // Absolute expiration time for this cache entry. 0 means no expiration (TTL disabled). 
- double expireTime = 0.0; }; using CommitProxyInfo = ModelInterface; From f4bad0af71c18df300c8ae1490a14deaf3733254 Mon Sep 17 00:00:00 2001 From: "Johannes M. Scheuermann" Date: Mon, 1 Dec 2025 16:26:47 +0100 Subject: [PATCH 6/7] Check the endpoint instead of the address for failures --- fdbclient/NativeAPI.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index c78f042426a..031a987301a 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1122,10 +1122,10 @@ ACTOR static Future cleanupLocationCache(DatabaseContext* cx) { // Iterate over all storage interfaces for this location (key range) cache. for (int i = 0; i < locationInfo->size(); ++i) { const auto& interf = (*locationInfo)[i]; - // Check if the address is marked as failed in the FailureMonitor. If so remove this key range and + // Check if the endpoint is marked as failed in the FailureMonitor. If so remove this key range and // stop iterating over the other storage interfaces. A single failed storage interface is enough to // remove the cached entry. - if (IFailureMonitor::failureMonitor().getState(interf->interf.address()).isFailed()) { + if (IFailureMonitor::failureMonitor().getState(interf->interf.getValue.getEndpoint()).isFailed()) { toRemove.push_back(iter->range()); break; } From 592eb23cf20b5f99b3941e992e8452b2dc7f1bd4 Mon Sep 17 00:00:00 2001 From: "Johannes M. 
Scheuermann" Date: Wed, 17 Dec 2025 19:10:28 +0100 Subject: [PATCH 7/7] Add additional logging --- fdbclient/NativeAPI.actor.cpp | 63 +++++++++++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 6 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 031a987301a..4caae864feb 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1097,7 +1097,9 @@ ACTOR static Future cleanupLocationCache(DatabaseContext* cx) { wait(delay(CLIENT_KNOBS->LOCATION_CACHE_EVICTION_INTERVAL)); std::vector toRemove; + std::set logged; int checkedEntries = 0; + int entriesWithValue = 0; // Fetch the current ranges of the location cache. auto ranges = cx->locationCache.ranges(); // Find where we left off using KEY (not iterator). @@ -1118,14 +1120,37 @@ ACTOR static Future cleanupLocationCache(DatabaseContext* cx) { } if (iter->value()) { + entriesWithValue++; auto& locationInfo = iter->value(); // Iterate over all storage interfaces for this location (key range) cache. 
for (int i = 0; i < locationInfo->size(); ++i) { const auto& interf = (*locationInfo)[i]; + const auto addr = interf->interf.address(); + + // Added some debugging output, remove after testing and make use of IFailureMonitor::failureMonitor().getState(endpoint).isFailed() + const auto endpoint = interf->interf.getValue.getEndpoint(); + const auto endpointFailed = IFailureMonitor::failureMonitor().getState(endpoint).isFailed(); + + if (logged.count(addr) == 0) { + const auto stableAddress = interf->interf.stableAddress(); + const auto failureInformation = cx->getEndpointFailureInfo(endpoint); + TraceEvent("LocationCacheCleanupDebug") + .detail("Address", addr) + .detail("StableAddress", stableAddress) + .detail("AddressFailed", IFailureMonitor::failureMonitor().getState(addr).isFailed()) + .detail("EndPointAddress", endpoint.getPrimaryAddress()) + .detail("EndPointStableAddress", endpoint.getStableAddress()) + .detail("EndpointFailed", endpointFailed) + .detail("StableAddressFailed", + IFailureMonitor::failureMonitor().getState(stableAddress).isFailed()) + .detail("FailedEndpointsOnHealthyServersInfoPresent", failureInformation.present()); + logged.insert(addr); + } + // Check if the endpoint is marked as failed in the FailureMonitor. If so remove this key range and // stop iterating over the other storage interfaces. A single failed storage interface is enough to // remove the cached entry. - if (IFailureMonitor::failureMonitor().getState(interf->interf.getValue.getEndpoint()).isFailed()) { + if (endpointFailed) { toRemove.push_back(iter->range()); break; } @@ -1144,18 +1169,44 @@ ACTOR static Future cleanupLocationCache(DatabaseContext* cx) { currentValidationPosition = Key(); } - // Remove expired entries + // Remove entries with failed storage server interfaces. 
for (const auto& range : toRemove) { cx->locationCache.insert(range, Reference()); } if (!toRemove.empty()) { CODE_PROBE(true, "LocationCacheCleanup removed some entries"); - TraceEvent("LocationCacheCleanup") - .detail("NumRemovedRanges", toRemove.size()) - .detail("NumCheckedEntries", checkedEntries) - .detail("NumLocalityCacheEntries", cx->locationCache.size()); } + + // Remove entries from the failedEndpointsOnHealthyServersInfo map if the last refresh time + // is 2 x CLIENT_KNOBS->LOCATION_CACHE_EVICTION_INTERVAL ago. Otherwise entries in the map + // will never be removed. + auto expireTimestamp = now() - 2 * CLIENT_KNOBS->LOCATION_CACHE_EVICTION_INTERVAL; + std::vector failedEndpoints; + for (const auto& failedEndpoint : cx->failedEndpointsOnHealthyServersInfo) { + if (failedEndpoint.second.lastRefreshTime <= expireTimestamp) { + failedEndpoints.push_back(failedEndpoint.first); + } + } + + for (const auto& failedEndpoint : failedEndpoints) { + TraceEvent("LocationCacheRemoveFailedEndpoint") + .detail("Address", failedEndpoint.getPrimaryAddress()); + cx->clearFailedEndpointOnHealthyServer(failedEndpoint); + } + + // TODO move back after debugging into the statement above. + TraceEvent("LocationCacheCleanup") + .detail("NumRemovedRanges", toRemove.size()) + .detail("NumCheckedEntries", checkedEntries) + .detail("NumLocalityCacheEntries", cx->locationCache.size()) + .detail("DatabaseContextServerInterfaceSize", cx->server_interf.size()) + .detail("FailedEndpointsOnHealthyServersInfoSize", cx->failedEndpointsOnHealthyServersInfo.size()) + .detail("FailedEndpointsOnHealthyServersInfoRemovedSize", failedEndpoints.size()) + .detail("SsidTagMappingSize", cx->ssidTagMapping.size()) + .detail("TssMapping", cx->tssMapping.size()) + .detail("ChangeFeedUpdaters", cx->changeFeedUpdaters.size()) + .detail("NumEntriesWithValue", entriesWithValue); } }