@@ -316,16 +316,21 @@ std::shared_ptr< homestore::snapshot_context > ReplicationStateMachine::last_sna
316
316
317
317
int ReplicationStateMachine::read_snapshot_obj (std::shared_ptr< homestore::snapshot_context > context,
318
318
std::shared_ptr< homestore::snapshot_obj > snp_obj) {
319
- HSHomeObject::PGBlobIterator* pg_iter = nullptr ;
320
-
321
- if (snp_obj->user_ctx == nullptr ) {
322
- // Create the pg blob iterator for the first time.
323
- pg_iter = new HSHomeObject::PGBlobIterator (*home_object_, repl_dev ()->group_id (), context->get_lsn ());
324
- snp_obj->user_ctx = (void *)pg_iter;
325
- LOGD (" Allocated new pg blob iterator={}, group={}, lsn={}" , static_cast < void * >(pg_iter),
326
- boost::uuids::to_string (repl_dev ()->group_id ()), context->get_lsn ());
327
- } else {
328
- pg_iter = r_cast< HSHomeObject::PGBlobIterator* >(snp_obj->user_ctx );
319
+ std::shared_ptr< HSHomeObject::PGBlobIterator > pg_iter;
320
+ {
321
+ std::lock_guard lk (m_snp_sync_ctx_lock);
322
+ if (snp_obj->user_ctx == nullptr ) {
323
+ // Create the pg blob iterator for the first time.
324
+ pg_iter = std::make_shared< HSHomeObject::PGBlobIterator >(*home_object_, repl_dev ()->group_id (),
325
+ context->get_lsn ());
326
+ auto pg_iter_ptr = new std::shared_ptr< HSHomeObject::PGBlobIterator >(pg_iter);
327
+ snp_obj->user_ctx = static_cast < void * >(pg_iter_ptr);
328
+ LOGD (" Allocated new pg blob iterator={}, group={}, lsn={}" , snp_obj->user_ctx ,
329
+ boost::uuids::to_string (repl_dev ()->group_id ()), context->get_lsn ());
330
+ } else {
331
+ auto pg_iter_ptr = static_cast < std::shared_ptr< HSHomeObject::PGBlobIterator >* >(snp_obj->user_ctx );
332
+ pg_iter = *pg_iter_ptr;
333
+ }
329
334
}
330
335
331
336
// Nuraft uses obj_id as a way to track the state of the snapshot read and write.
@@ -357,6 +362,17 @@ int ReplicationStateMachine::read_snapshot_obj(std::shared_ptr< homestore::snaps
357
362
LOGW (" Invalid objId in snapshot read, {}, current shard_seq_num={}, current batch_num={}" , log_str,
358
363
pg_iter->cur_obj_id_ .shard_seq_num , pg_iter->cur_obj_id_ .batch_id );
359
364
return -1 ;
365
+ // There is a known corner case (not sure if it is the only case): If free_user_snp_ctx and read_snapshot_obj(we
366
+ // enable nuraft bg snapshot) occur at the same time, and free_user_snp_ctx is called first, pg_iter is
367
+ // released, and then in read_snapshot_obj, pg_iter will be created with cur_obj_id_ = 0|0 while the
368
+ // next_obj_id will be x|y which may hit into invalid objId condition.
369
+ // If an inconsistency happens, reset the cursor to the beginning (0|0), and let the follower validate (lsn may change) and reset
370
+ // its cursor to the checkpoint to proceed with snapshot resync.
371
+ LOGW (" Invalid objId in snapshot read, {}, current shard_seq_num={}, current batch_num={}, reset cursor to the "
372
+ " beginning" ,
373
+ log_str, pg_iter->cur_obj_id_ .shard_seq_num , pg_iter->cur_obj_id_ .batch_id );
374
+ pg_iter->reset_cursor ();
375
+ return 0 ;
360
376
}
361
377
362
378
// pg metadata message
@@ -532,11 +548,11 @@ void ReplicationStateMachine::free_user_snp_ctx(void*& user_snp_ctx) {
532
548
LOGE (" User snapshot context null group={}" , boost::uuids::to_string (repl_dev ()->group_id ()));
533
549
return ;
534
550
}
535
-
536
- auto pg_iter = r_cast< HSHomeObject::PGBlobIterator* >(user_snp_ctx);
537
- LOGD (" Freeing snapshot iterator={}, pg={} group={}" , static_cast < void * >(pg_iter), pg_iter ->pg_id_ ,
538
- boost::uuids::to_string (pg_iter ->group_id_ ));
539
- delete pg_iter ;
551
+ std::lock_guard lk (m_snp_sync_ctx_lock);
552
+ auto pg_iter_ptr = static_cast <std::shared_ptr< HSHomeObject::PGBlobIterator>* >(user_snp_ctx);
553
+ LOGD (" Freeing snapshot iterator={}, pg={} group={}" , user_snp_ctx, (*pg_iter_ptr) ->pg_id_ ,
554
+ boost::uuids::to_string ((*pg_iter_ptr) ->group_id_ ));
555
+ delete pg_iter_ptr ;
540
556
user_snp_ctx = nullptr ;
541
557
}
542
558
0 commit comments