Skip to content

Commit 7d26a32

Browse files
authored
[Store] (CI run_tests_with_ssd failed / promotion-on-hit failed) eliminate race conditions (#2235)
* fix(transfer-engine): eliminate race condition in rePublishRpcMetaEntry

  Remove the redundant storage_plugin_->remove() call before set(). All storage backends (HTTP PUT, Redis SET, Etcd put) have upsert semantics, so remove-then-set creates a window in which a concurrent get() returns empty / 404, causing transfer failures (-800).

  Also change Json::UInt64 to Json::UInt for rpc_port to ensure that the existing == desired comparison works correctly after JSON parse.

* fix: adjust eviction thread initial timing to prevent race in CI SSD tests

  Start last_discard_time with an already-elapsed window so that the first loop iteration triggers DiscardExpiredProcessingReplicas immediately. Without this, a task admitted shortly after thread startup can survive the first reaper cycle and not be cleaned up until ~2s later, causing promotion-on-hit tests that sleep for 2s to flake.
1 parent eed58e8 commit 7d26a32

2 files changed

Lines changed: 10 additions & 7 deletions

File tree

mooncake-store/src/master_service.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3476,7 +3476,13 @@ auto MasterService::NotifyPromotionFailure(const UUID& client_id,
34763476
void MasterService::EvictionThreadFunc() {
34773477
VLOG(1) << "action=eviction_thread_started";
34783478

3479-
auto last_discard_time = std::chrono::system_clock::now();
3479+
// Start with an already-elapsed window so the first loop iteration
3480+
// (after kEvictionThreadSleepMs) triggers DiscardExpiredProcessingReplicas.
3481+
// Without this, a task admitted shortly after thread startup can survive
3482+
// the first reaper cycle and not be cleaned until ~2s later, causing
3483+
// promotion-on-hit tests that sleep for 2s to flake.
3484+
auto last_discard_time =
3485+
std::chrono::system_clock::now() - put_start_release_timeout_sec_;
34803486
while (eviction_running_) {
34813487
const auto now = std::chrono::system_clock::now();
34823488
double used_ratio =

mooncake-transfer-engine/src/transfer_metadata.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1128,7 +1128,7 @@ int TransferMetadata::addRpcMetaEntry(const std::string &server_name,
11281128

11291129
Json::Value rpcMetaJSON;
11301130
rpcMetaJSON["ip_or_host_name"] = desc.ip_or_host_name;
1131-
rpcMetaJSON["rpc_port"] = static_cast<Json::UInt64>(desc.rpc_port);
1131+
rpcMetaJSON["rpc_port"] = static_cast<Json::UInt>(desc.rpc_port);
11321132
if (!storage_plugin_->set(rpc_meta_prefix_ + server_name, rpcMetaJSON)) {
11331133
LOG(ERROR) << "Failed to set location of " << server_name;
11341134
return ERR_METADATA;
@@ -1157,19 +1157,16 @@ int TransferMetadata::rePublishRpcMetaEntry(const std::string &server_name) {
11571157
if (storage_plugin_->get(full_key, existing)) {
11581158
Json::Value desired;
11591159
desired["ip_or_host_name"] = local_rpc_meta_.ip_or_host_name;
1160-
desired["rpc_port"] =
1161-
static_cast<Json::UInt64>(local_rpc_meta_.rpc_port);
1160+
desired["rpc_port"] = static_cast<Json::UInt>(local_rpc_meta_.rpc_port);
11621161
if (existing == desired) {
11631162
return 0;
11641163
}
1165-
storage_plugin_->remove(full_key);
11661164
}
11671165

11681166
LOG(INFO) << "Re-publishing RPC meta entry for " << server_name;
11691167
Json::Value rpcMetaJSON;
11701168
rpcMetaJSON["ip_or_host_name"] = local_rpc_meta_.ip_or_host_name;
1171-
rpcMetaJSON["rpc_port"] =
1172-
static_cast<Json::UInt64>(local_rpc_meta_.rpc_port);
1169+
rpcMetaJSON["rpc_port"] = static_cast<Json::UInt>(local_rpc_meta_.rpc_port);
11731170
if (!storage_plugin_->set(full_key, rpcMetaJSON)) {
11741171
LOG(ERROR) << "Failed to re-publish RPC meta entry for " << server_name;
11751172
return ERR_METADATA;

0 commit comments

Comments
 (0)