Skip to content

Commit 3f60adf

Browse files
handling config change where we also need to check no_space_left error info (#293)
1 parent be71140 commit 3f60adf

File tree

3 files changed

+59
-61
lines changed

3 files changed

+59
-61
lines changed

conanfile.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
class HomeObjectConan(ConanFile):
1111
name = "homeobject"
12-
version = "2.3.15"
12+
version = "2.3.16"
1313

1414
homepage = "https://github.com/eBay/HomeObject"
1515
description = "Blob Store built on HomeReplication"
@@ -49,7 +49,7 @@ def build_requirements(self):
4949

5050
def requirements(self):
5151
self.requires("sisl/[^12.2]@oss/master", transitive_headers=True)
52-
self.requires("homestore/[^6.12]@oss/master")
52+
self.requires("homestore/[^6.13]@oss/master")
5353
self.requires("iomgr/[^11.3]@oss/master")
5454
self.requires("lz4/1.9.4", override=True)
5555
self.requires("openssl/3.3.1", override=True)

src/lib/homestore_backend/replication_state_machine.cpp

Lines changed: 39 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,10 @@ void ReplicationStateMachine::on_commit(int64_t lsn, const sisl::blob& header, c
4040
break;
4141
}
4242
}
43+
}
4344

45+
void ReplicationStateMachine::notify_committed_lsn(int64_t lsn) {
46+
LOGD("got committed lsn notification , lsn={}", lsn);
4447
// handle no_space_left error if we have any
4548
const auto [target_lsn, chunk_id] = get_no_space_left_error_info();
4649
if (std::numeric_limits< homestore::repl_lsn_t >::max() == target_lsn) {
@@ -64,8 +67,7 @@ void ReplicationStateMachine::on_commit(int64_t lsn, const sisl::blob& header, c
6467
}
6568

6669
if (target_lsn == lsn) {
67-
// check if there is pending no_space_left error to be handled. only follower will handle this
68-
LOGD("handle no_space_left_error_info, lsn={}, chunk_id={}", lsn, chunk_id);
70+
LOGD("match no_space_left_error_info, lsn={}, chunk_id={}", lsn, chunk_id);
6971
handle_no_space_left(lsn, chunk_id);
7072
reset_no_space_left_error_info();
7173
}
@@ -134,13 +136,29 @@ void ReplicationStateMachine::on_rollback(int64_t lsn, sisl::blob const& header,
134136
}
135137
}
136138

137-
// cancel no_space_left_error_info if matches
138139
const auto [target_lsn, chunk_id] = get_no_space_left_error_info();
139-
if (target_lsn >= lsn) {
140-
LOGD("cancel no_space_left_error_info, wait_commit_lsn={}, chunk_id={}, currrent lsn={}", target_lsn, chunk_id,
141-
lsn);
142-
reset_no_space_left_error_info();
143-
}
140+
141+
RELEASE_ASSERT(
142+
target_lsn >= lsn,
143+
"wait_commit_lsn should be bigger than rollbacked lsn wait_commit_lsn={}, chunk_id={}, currrent lsn={}",
144+
target_lsn, chunk_id, lsn);
145+
146+
// if target_lsn is int64_max, it is still ok to call reset_no_space_left_error_info
147+
reset_no_space_left_error_info();
148+
}
149+
150+
void ReplicationStateMachine::on_config_rollback(int64_t lsn) {
151+
LOGD("rollback config at lsn={}", lsn);
152+
153+
const auto [target_lsn, chunk_id] = get_no_space_left_error_info();
154+
155+
RELEASE_ASSERT(
156+
target_lsn >= lsn,
157+
"wait_commit_lsn should be bigger than rollbacked lsn wait_commit_lsn={}, chunk_id={}, currrent lsn={}",
158+
target_lsn, chunk_id, lsn);
159+
160+
// if target_lsn is int64_max, it is still ok to call reset_no_space_left_error_info
161+
reset_no_space_left_error_info();
144162
}
145163

146164
void ReplicationStateMachine::on_restart() { LOGD("ReplicationStateMachine::on_restart"); }
@@ -807,61 +825,23 @@ void HSHomeObject::on_snp_ctx_meta_blk_recover_completed(bool success) {
807825
LOGI("Snapshot context meta blk recover completed");
808826
}
809827

810-
// on_no_space_left will be called in raft channel, which will blocking new logs to come in.
811-
// here, we need to check if all the appended logs have been already committed.
812-
813-
// if yes, we can guarantee all the index entry of all the logs have already been written to pg index table(which is
814-
// used by gc thread to identify valid blobs), so we can handle the no_space_left error directly.
815-
816-
// if not, we need to set an no_space_left error info and wait for the commit thread to handle this error after the
817-
// lsn-1 is committed.
818-
819828
void ReplicationStateMachine::on_no_space_left(homestore::repl_lsn_t lsn, homestore::chunk_num_t chunk_id) {
820829
LOGD("got no_space_left error at lsn={}, chunk_id={}", lsn, chunk_id);
821830

822-
// we need to pause statemachin here before we check and compare last_append_lsn and last_commit_lsn, since there is
823-
// very corner case that when we do this, the last_append_lsn is being committed. for example, last_append_lsn is 10
824-
// and last_commit_lsn is 9, so we will set an erro info and wait for the commit thread to handle it after lsn 10
825-
// is committed. but lsn 10 is probably being committed now, if the commit is completed after the comparation but
826-
// before we set the error info, then commit thread will never handle the error info, since all the appended log
827-
// have been committed and new more new logs can be appended, so on_commit will never be trigger again.
828-
repl_dev()->pause_statemachine();
829-
830-
// basically, there are three cases:
831-
// 1. if there is some appended logs that have not been committed, we need to set an error info and delegate
832-
// handle_no_space_left to on_commit.
833-
// 2. if all the appended logs have been committed, but the lsn, which no_space_left is triggered, is exactly the
834-
// last_append_lsn + 1, we must handle_no_space_left inline.
835-
// 3. if all the appended logs have been committed, but the lsn, which no_space_left is triggered, is larger than
836-
// last_append_lsn + 1, we can select anyone of both, handle_no_space_left inline or set an error info or delegate
837-
// it to on_commit.
838-
839-
// last_append_lsn here means the raft_lsn of the last log in logstore.
840-
const auto last_append_lsn = repl_dev()->get_last_append_lsn();
841-
const auto last_commit_lsn = repl_dev()->get_last_commit_lsn();
842-
LOGD("last_append_lsn={}, last_commit_lsn={}", last_append_lsn, last_commit_lsn);
843-
if (last_append_lsn == last_commit_lsn) {
844-
// if last_append_lsn == last_commit_lsn, it means all the logs have been committed, so we have to handle it
845-
// here, since on_commit will never be triggered from now on.
846-
LOGD("all the committed logs have been committed, handle no_space_left directly!");
847-
handle_no_space_left(lsn, chunk_id);
848-
} else {
849-
const auto [error_lsn, error_chunk_id] = get_no_space_left_error_info();
850-
if (lsn - 1 < error_lsn) {
851-
/*set a new error info or overwrite an existing error info*/
852-
// we wait for the commit thread to handle this error after lsn - 1 is committed.
853-
LOGD("wait for the commit thread to handle no_space_left error after lsn {} is committed, "
854-
"last_append_lsn {}, last_commit_lsn {}",
855-
lsn - 1, last_append_lsn, last_commit_lsn);
856-
set_no_space_left_error_info(lsn - 1, error_chunk_id);
857-
} else {
858-
LOGD("got no_space_left error but my expected lsn {} is larger than existing error info lsn {}, "
859-
"ignore it!",
860-
lsn - 1, error_lsn);
861-
}
862-
}
831+
const auto [target_lsn, error_chunk_id] = get_no_space_left_error_info();
832+
833+
RELEASE_ASSERT(lsn - 1 <= target_lsn,
834+
"new target lsn should be less than or equal to the existing target "
835+
"lsn, new_target_lsn={}, existing_target_lsn={}",
836+
lsn - 1, target_lsn);
837+
838+
// set a new error info or overwrite an existing error info, postpone handling this error until lsn - 1 is
839+
// committed.
840+
LOGD("set no_space_left error info with lsn={}, chunk_id={}, existing error info: lsn={}, chunk_id={}", lsn - 1,
841+
chunk_id, target_lsn, error_chunk_id);
863842

864-
repl_dev()->resume_statemachine();
843+
// setting the same error info is ok.
844+
set_no_space_left_error_info(lsn - 1, chunk_id);
865845
}
866846

867847
void ReplicationStateMachine::set_no_space_left_error_info(homestore::repl_lsn_t lsn, homestore::chunk_num_t chunk_id) {

src/lib/homestore_backend/replication_state_machine.hpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,24 @@ class ReplicationStateMachine : public homestore::ReplDevListener {
202202
// @param chunk_id - on which chunk no_space_left happened
203203
void on_no_space_left(homestore::repl_lsn_t lsn, homestore::chunk_num_t chunk_id) override;
204204

205+
/// @brief Called when the config log entry has been rolled back.
206+
///
207+
/// This function is called on followers only when the log entry is going to be overwritten. This function is called
208+
/// from a random worker thread, but is guaranteed to be serialized.
209+
///
210+
/// For each config log index, it is guaranteed that either on_config_commit() or on_config_rollback() is called but
211+
/// not both.
212+
/// @param lsn - The log sequence number of the rolled-back config log entry
213+
void on_config_rollback(int64_t lsn) override;
214+
215+
/// @brief Periodically called to notify the listener of the latest committed lsn.
216+
/// NOTE: this callback blocks the thread that flushes the latest committed lsn into the repl_dev superblk as DC_LSN,
217+
/// so please avoid any heavy or blocking operation in this callback.
218+
///
219+
/// @param lsn - The latest committed log sequence number so far
220+
///
221+
void notify_committed_lsn(int64_t lsn) override;
222+
205223
private:
206224
HSHomeObject* home_object_{nullptr};
207225

0 commit comments

Comments
 (0)