Skip to content

Commit 4701628

Browse files
bowenwang1996Bowen Wang
authored andcommitted
fix(gc): address slowness in forks cleaning (#3121)
Currently we always iterate from `tail` to `gc_stop_height` when cleaning forks, which is needlessly slow especially at the epoch beginning. This caused validators to potentially miss blocks at the beginning of an epoch when epoch length is 43k. To address this, we change it so that we only go through all heights in epoch once. More specifically, we maintain a fork tail that represents the height of the current fork cleaning process and when `gc_stop_height` changes, it changes to `gc_stop_height`. Otherwise it decreases monotonically from `gc_stop_height` to `tail`. To make this fully work, we also need to prevent blocks with height smaller than `gc_stop_height` from being accepted, which I find no problem with. Test plan ---------- * existing gc tests. * `test_block_height_too_old`.
1 parent 98674b1 commit 4701628

7 files changed

Lines changed: 159 additions & 26 deletions

File tree

chain/chain/src/chain.rs

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,9 @@ const NEAR_BASE: Balance = 1_000_000_000_000_000_000_000_000;
7373
/// Number of epochs for which we keep store data
7474
pub const NUM_EPOCHS_TO_KEEP_STORE_DATA: u64 = 5;
7575

76+
/// Maximum number of height to go through at each step when cleaning forks during garbage collection.
77+
const GC_FORK_CLEAN_STEP: u64 = 1000;
78+
7679
enum ApplyChunksMode {
7780
ThisEpoch,
7881
NextEpoch,
@@ -519,20 +522,38 @@ impl Chain {
519522
_ => return Err(e),
520523
},
521524
};
525+
522526
if gc_stop_height > head.height {
523527
return Err(ErrorKind::GCError(
524528
"gc_stop_height cannot be larger than head.height".into(),
525529
)
526530
.into());
527531
}
532+
let prev_epoch_id = self.get_block_header(&head.prev_block_hash)?.epoch_id();
533+
let epoch_change = prev_epoch_id != &head.epoch_id;
534+
let fork_tail = if epoch_change {
535+
// if head doesn't change on the epoch boundary, we may update fork tail several times
536+
// but that is fine since it doesn't affect correctness and also we limit the number of
537+
// heights that fork cleaning goes through so it doesn't slow down client either.
538+
let mut chain_store_update = self.store.store_update();
539+
chain_store_update.update_fork_tail(gc_stop_height);
540+
chain_store_update.commit()?;
541+
gc_stop_height
542+
} else {
543+
self.store.fork_tail()?
544+
};
528545
let mut gc_blocks_remaining = gc_blocks_limit;
529546

530547
// Forks Cleaning
531-
for height in tail..gc_stop_height {
548+
let stop_height = std::cmp::max(tail, fork_tail.saturating_sub(GC_FORK_CLEAN_STEP));
549+
for height in (stop_height..fork_tail).rev() {
550+
self.clear_forks_data(tries.clone(), height, &mut gc_blocks_remaining)?;
532551
if gc_blocks_remaining == 0 {
533552
return Ok(());
534553
}
535-
self.clear_forks_data(tries.clone(), height, &mut gc_blocks_remaining)?;
554+
let mut chain_store_update = self.store.store_update();
555+
chain_store_update.update_fork_tail(height);
556+
chain_store_update.commit()?;
536557
}
537558

538559
// Canonical Chain Clearing
@@ -2855,6 +2876,7 @@ impl<'a> ChainUpdate<'a> {
28552876
let prev_gas_price = prev.gas_price();
28562877
let prev_epoch_id = prev.epoch_id().clone();
28572878
let prev_random_value = *prev.random_value();
2879+
let prev_height = prev.height();
28582880

28592881
// Block is an orphan if we do not know about the previous full block.
28602882
if !is_next && !self.chain_store_update.block_exists(&prev_hash)? {
@@ -2863,8 +2885,14 @@ impl<'a> ChainUpdate<'a> {
28632885

28642886
// A heuristic to prevent block height to jump too fast towards BlockHeight::max and cause
28652887
// overflow-related problems
2866-
if block.header().height() > head.height + self.epoch_length * 20 {
2867-
return Err(ErrorKind::InvalidBlockHeight.into());
2888+
let block_height = block.header().height();
2889+
if block_height > head.height + self.epoch_length * 20 {
2890+
return Err(ErrorKind::InvalidBlockHeight(block_height).into());
2891+
}
2892+
2893+
// Do not accept old forks
2894+
if prev_height < self.runtime_adapter.get_gc_stop_height(&head.last_block_hash)? {
2895+
return Err(ErrorKind::InvalidBlockHeight(prev_height).into());
28682896
}
28692897

28702898
let (is_caught_up, needs_to_start_fetching_state) =

chain/chain/src/error.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ use near_primitives::errors::{EpochError, StorageError};
1111
use near_primitives::hash::CryptoHash;
1212
use near_primitives::serialize::to_base;
1313
use near_primitives::sharding::{ChunkHash, ShardChunkHeader};
14-
use near_primitives::types::ShardId;
14+
use near_primitives::types::{BlockHeight, ShardId};
1515

1616
#[derive(Debug)]
1717
pub struct Error {
@@ -42,8 +42,8 @@ pub enum ErrorKind {
4242
#[fail(display = "Invalid Block Time: Too far in the future: {}", _0)]
4343
InvalidBlockFutureTime(DateTime<Utc>),
4444
/// Block height is invalid (not previous + 1).
45-
#[fail(display = "Invalid Block Height")]
46-
InvalidBlockHeight,
45+
#[fail(display = "Invalid Block Height {}", _0)]
46+
InvalidBlockHeight(BlockHeight),
4747
/// Invalid block proposed signature.
4848
#[fail(display = "Invalid Block Proposer Signature")]
4949
InvalidBlockProposer,
@@ -240,7 +240,7 @@ impl Error {
240240
| ErrorKind::DBNotFoundErr(_) => false,
241241
ErrorKind::InvalidBlockPastTime(_, _)
242242
| ErrorKind::InvalidBlockFutureTime(_)
243-
| ErrorKind::InvalidBlockHeight
243+
| ErrorKind::InvalidBlockHeight(_)
244244
| ErrorKind::InvalidBlockProposer
245245
| ErrorKind::InvalidChunk
246246
| ErrorKind::InvalidChunkProofs(_)

chain/chain/src/store.rs

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@ use near_store::{
4242
ColReceiptIdToShardId, ColState, ColStateChanges, ColStateDlInfos, ColStateHeaders,
4343
ColStateParts, ColTransactionRefCount, ColTransactionResult, ColTransactions, ColTrieChanges,
4444
DBCol, KeyForStateChanges, ShardTries, Store, StoreUpdate, TrieChanges, WrappedTrieChanges,
45-
CHUNK_TAIL_KEY, HEADER_HEAD_KEY, HEAD_KEY, LARGEST_TARGET_HEIGHT_KEY, LATEST_KNOWN_KEY,
46-
SHOULD_COL_GC, SYNC_HEAD_KEY, TAIL_KEY,
45+
CHUNK_TAIL_KEY, FORK_TAIL_KEY, HEADER_HEAD_KEY, HEAD_KEY, LARGEST_TARGET_HEIGHT_KEY,
46+
LATEST_KNOWN_KEY, SHOULD_COL_GC, SYNC_HEAD_KEY, TAIL_KEY,
4747
};
4848

4949
use crate::byzantine_assert;
@@ -85,6 +85,8 @@ pub trait ChainStoreAccess {
8585
fn tail(&self) -> Result<BlockHeight, Error>;
8686
/// The chain Chunks Tail height.
8787
fn chunk_tail(&self) -> Result<BlockHeight, Error>;
88+
/// Tail height of the fork cleaning process.
89+
fn fork_tail(&self) -> Result<BlockHeight, Error>;
8890
/// Head of the header chain (not the same thing as head_header).
8991
fn header_head(&self) -> Result<Tip, Error>;
9092
/// The "sync" head: last header we received from syncing.
@@ -182,8 +184,9 @@ pub trait ChainStoreAccess {
182184
hash = *header.prev_hash();
183185
header = self.get_block_header(&hash)?;
184186
}
185-
if header.height() < height {
186-
return Err(ErrorKind::InvalidBlockHeight.into());
187+
let header_height = header.height();
188+
if header_height < height {
189+
return Err(ErrorKind::InvalidBlockHeight(header_height).into());
187190
}
188191
self.get_block_header(&hash)
189192
}
@@ -527,6 +530,13 @@ impl ChainStoreAccess for ChainStore {
527530
.map_err(|e| e.into())
528531
}
529532

533+
fn fork_tail(&self) -> Result<BlockHeight, Error> {
534+
self.store
535+
.get_ser(ColBlockMisc, FORK_TAIL_KEY)
536+
.map(|option| option.unwrap_or_else(|| self.genesis_height))
537+
.map_err(|e| e.into())
538+
}
539+
530540
/// The "sync" head: last header we received from syncing.
531541
fn sync_head(&self) -> Result<Tip, Error> {
532542
option_to_not_found(self.store.get_ser(ColBlockMisc, SYNC_HEAD_KEY), "SYNC_HEAD")
@@ -1138,6 +1148,7 @@ pub struct ChainStoreUpdate<'a> {
11381148
head: Option<Tip>,
11391149
tail: Option<BlockHeight>,
11401150
chunk_tail: Option<BlockHeight>,
1151+
fork_tail: Option<BlockHeight>,
11411152
header_head: Option<Tip>,
11421153
sync_head: Option<Tip>,
11431154
largest_target_height: Option<BlockHeight>,
@@ -1161,6 +1172,7 @@ impl<'a> ChainStoreUpdate<'a> {
11611172
head: None,
11621173
tail: None,
11631174
chunk_tail: None,
1175+
fork_tail: None,
11641176
header_head: None,
11651177
sync_head: None,
11661178
largest_target_height: None,
@@ -1249,6 +1261,15 @@ impl<'a> ChainStoreAccess for ChainStoreUpdate<'a> {
12491261
}
12501262
}
12511263

1264+
/// Fork tail used by GC
1265+
fn fork_tail(&self) -> Result<BlockHeight, Error> {
1266+
if let Some(fork_tail) = &self.fork_tail {
1267+
Ok(fork_tail.clone())
1268+
} else {
1269+
self.chain_store.fork_tail()
1270+
}
1271+
}
1272+
12521273
/// The "sync" head: last header we received from syncing.
12531274
fn sync_head(&self) -> Result<Tip, Error> {
12541275
if let Some(sync_head) = &self.sync_head {
@@ -1932,18 +1953,28 @@ impl<'a> ChainStoreUpdate<'a> {
19321953
pub fn reset_tail(&mut self) {
19331954
self.tail = None;
19341955
self.chunk_tail = None;
1956+
self.fork_tail = None;
19351957
}
19361958

19371959
pub fn update_tail(&mut self, height: BlockHeight) {
19381960
self.tail = Some(height);
19391961
let genesis_height = self.get_genesis_height();
1940-
let chunk_tail = self.chunk_tail().unwrap_or_else(|_| genesis_height);
1962+
// When fork tail is behind tail, it doesn't hurt to set it to tail for consistency.
1963+
if self.fork_tail.unwrap_or(genesis_height) < height {
1964+
self.fork_tail = Some(height);
1965+
}
1966+
1967+
let chunk_tail = self.chunk_tail().unwrap_or(genesis_height);
19411968
if chunk_tail == genesis_height {
19421969
// For consistency, Chunk Tail should be set if Tail is set
19431970
self.chunk_tail = Some(self.get_genesis_height());
19441971
}
19451972
}
19461973

1974+
pub fn update_fork_tail(&mut self, height: BlockHeight) {
1975+
self.fork_tail = Some(height);
1976+
}
1977+
19471978
pub fn update_chunk_tail(&mut self, height: BlockHeight) {
19481979
self.chunk_tail = Some(height);
19491980
}
@@ -2366,6 +2397,7 @@ impl<'a> ChainStoreUpdate<'a> {
23662397
Self::write_col_misc(&mut store_update, HEAD_KEY, &mut self.head)?;
23672398
Self::write_col_misc(&mut store_update, TAIL_KEY, &mut self.tail)?;
23682399
Self::write_col_misc(&mut store_update, CHUNK_TAIL_KEY, &mut self.chunk_tail)?;
2400+
Self::write_col_misc(&mut store_update, FORK_TAIL_KEY, &mut self.fork_tail)?;
23692401
Self::write_col_misc(&mut store_update, SYNC_HEAD_KEY, &mut self.sync_head)?;
23702402
Self::write_col_misc(&mut store_update, HEADER_HEAD_KEY, &mut self.header_head)?;
23712403
Self::write_col_misc(

chain/chain/src/store_validator/validate.rs

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ use near_primitives::utils::{get_block_shard_id, index_to_bytes};
1616
use near_store::{
1717
ColBlock, ColBlockHeader, ColBlockHeight, ColBlockInfo, ColBlockMisc, ColBlockPerHeight,
1818
ColChunkExtra, ColChunkHashesByHeight, ColChunks, ColOutcomesByBlockHash, ColStateHeaders,
19-
ColTransactionResult, DBCol, TrieChanges, TrieIterator, CHUNK_TAIL_KEY, HEADER_HEAD_KEY,
20-
HEAD_KEY, NUM_COLS, SHOULD_COL_GC, TAIL_KEY,
19+
ColTransactionResult, DBCol, TrieChanges, TrieIterator, CHUNK_TAIL_KEY, FORK_TAIL_KEY,
20+
HEADER_HEAD_KEY, HEAD_KEY, NUM_COLS, SHOULD_COL_GC, TAIL_KEY,
2121
};
2222

2323
use crate::StoreValidator;
@@ -107,6 +107,7 @@ macro_rules! unwrap_or_err_db {
107107
pub(crate) fn head_tail_validity(sv: &mut StoreValidator) -> Result<(), StoreValidatorError> {
108108
let mut tail = sv.config.genesis_height;
109109
let mut chunk_tail = sv.config.genesis_height;
110+
let mut fork_tail = sv.config.genesis_height;
110111
let tail_db = unwrap_or_err!(
111112
sv.store.get_ser::<BlockHeight>(ColBlockMisc, TAIL_KEY),
112113
"Can't get Tail from storage"
@@ -115,13 +116,21 @@ pub(crate) fn head_tail_validity(sv: &mut StoreValidator) -> Result<(), StoreVal
115116
sv.store.get_ser::<BlockHeight>(ColBlockMisc, CHUNK_TAIL_KEY),
116117
"Can't get Chunk Tail from storage"
117118
);
119+
let fork_tail_db = unwrap_or_err!(
120+
sv.store.get_ser::<BlockHeight>(ColBlockMisc, FORK_TAIL_KEY),
121+
"Can't get Chunk Tail from storage"
122+
);
118123
if tail_db.is_none() && chunk_tail_db.is_some() || tail_db.is_some() && chunk_tail_db.is_none()
119124
{
120125
err!("Tail is {:?} and Chunk Tail is {:?}", tail_db, chunk_tail_db);
121126
}
122-
if tail_db.is_some() && chunk_tail_db.is_some() {
127+
if tail_db.is_some() && fork_tail_db.is_none() {
128+
err!("Tail is {:?} but fork tail is None", tail_db);
129+
}
130+
if tail_db.is_some() {
123131
tail = tail_db.unwrap();
124132
chunk_tail = chunk_tail_db.unwrap();
133+
fork_tail = fork_tail_db.unwrap();
125134
}
126135
let head = unwrap_or_err_db!(
127136
sv.store.get_ser::<Tip>(ColBlockMisc, HEAD_KEY),
@@ -141,6 +150,12 @@ pub(crate) fn head_tail_validity(sv: &mut StoreValidator) -> Result<(), StoreVal
141150
if tail > head.height {
142151
err!("tail > head.height, {:?} > {:?}", tail, head);
143152
}
153+
if tail > fork_tail {
154+
err!("tail > fork_tail, {} > {}", tail, fork_tail);
155+
}
156+
if fork_tail > head.height {
157+
err!("fork tail > head.height, {} > {:?}", fork_tail, head);
158+
}
144159
if head.height > header_head.height {
145160
err!("head.height > header_head.height, {:?} > {:?}", tail, head);
146161
}

chain/client/tests/process_blocks.rs

Lines changed: 65 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -971,20 +971,25 @@ fn test_invalid_gas_price() {
971971
}
972972

973973
#[test]
974-
fn test_invalid_height() {
974+
fn test_invalid_height_too_large() {
975975
let mut env = TestEnv::new(ChainGenesis::test(), 1, 1);
976976
let b1 = env.clients[0].produce_block(1).unwrap().unwrap();
977977
let _ = env.clients[0].process_block(b1.clone(), Provenance::PRODUCED);
978978
let signer = InMemoryValidatorSigner::from_seed("test0", KeyType::ED25519, "test0");
979979
let b2 = Block::empty_with_height(&b1, std::u64::MAX, &signer);
980-
let (_, tip) = env.clients[0].process_block(b2, Provenance::NONE);
981-
match tip {
982-
Err(e) => match e.kind() {
983-
ErrorKind::InvalidBlockHeight => {}
984-
_ => assert!(false, "wrong error: {}", e),
985-
},
986-
_ => assert!(false, "succeeded, tip: {:?}", tip),
980+
let (_, res) = env.clients[0].process_block(b2, Provenance::NONE);
981+
assert!(matches!(res.unwrap_err().kind(), ErrorKind::InvalidBlockHeight(_)));
982+
}
983+
984+
#[test]
985+
fn test_invalid_height_too_old() {
986+
let mut env = TestEnv::new(ChainGenesis::test(), 1, 1);
987+
let b1 = env.clients[0].produce_block(1).unwrap().unwrap();
988+
for i in 2..100 {
989+
env.produce_block(0, i);
987990
}
991+
let (_, res) = env.clients[0].process_block(b1, Provenance::NONE);
992+
assert!(matches!(res.unwrap_err().kind(), ErrorKind::InvalidBlockHeight(_)));
988993
}
989994

990995
#[test]
@@ -1019,6 +1024,11 @@ fn test_gc_with_epoch_length_common(epoch_length: NumBlocks) {
10191024
for i in 1..=epoch_length * (NUM_EPOCHS_TO_KEEP_STORE_DATA + 1) {
10201025
let block = env.clients[0].produce_block(i).unwrap().unwrap();
10211026
env.process_block(0, block.clone(), Provenance::PRODUCED);
1027+
assert!(
1028+
env.clients[0].chain.store().fork_tail().unwrap()
1029+
<= env.clients[0].chain.store().tail().unwrap()
1030+
);
1031+
10221032
blocks.push(block);
10231033
}
10241034
for i in 1..=epoch_length * (NUM_EPOCHS_TO_KEEP_STORE_DATA + 1) {
@@ -1238,6 +1248,53 @@ fn test_gc_after_state_sync() {
12381248
assert!(env.clients[1].chain.clear_data(tries, 2).is_ok());
12391249
}
12401250

1251+
#[test]
1252+
fn test_gc_fork_tail() {
1253+
let epoch_length = 101;
1254+
let mut genesis = Genesis::test(vec!["test0", "test1"], 1);
1255+
genesis.config.epoch_length = epoch_length;
1256+
let runtimes: Vec<Arc<dyn RuntimeAdapter>> = vec![
1257+
Arc::new(neard::NightshadeRuntime::new(
1258+
Path::new("."),
1259+
create_test_store(),
1260+
Arc::new(genesis.clone()),
1261+
vec![],
1262+
vec![],
1263+
)),
1264+
Arc::new(neard::NightshadeRuntime::new(
1265+
Path::new("."),
1266+
create_test_store(),
1267+
Arc::new(genesis.clone()),
1268+
vec![],
1269+
vec![],
1270+
)),
1271+
];
1272+
let mut chain_genesis = ChainGenesis::test();
1273+
chain_genesis.epoch_length = epoch_length;
1274+
let mut env = TestEnv::new_with_runtime(chain_genesis.clone(), 2, 1, runtimes);
1275+
let b1 = env.clients[0].produce_block(1).unwrap().unwrap();
1276+
for i in 0..2 {
1277+
env.process_block(i, b1.clone(), Provenance::NONE);
1278+
}
1279+
// create 100 forks
1280+
for i in 2..102 {
1281+
let block = env.clients[0].produce_block(i).unwrap().unwrap();
1282+
env.process_block(1, block, Provenance::NONE);
1283+
}
1284+
for i in 102..epoch_length * NUM_EPOCHS_TO_KEEP_STORE_DATA + 5 {
1285+
let block = env.clients[0].produce_block(i).unwrap().unwrap();
1286+
for j in 0..2 {
1287+
env.process_block(j, block.clone(), Provenance::NONE);
1288+
}
1289+
}
1290+
let head = env.clients[1].chain.head().unwrap();
1291+
assert!(
1292+
env.clients[1].runtime_adapter.get_gc_stop_height(&head.last_block_hash).unwrap()
1293+
> epoch_length
1294+
);
1295+
assert_eq!(env.clients[1].chain.store().fork_tail().unwrap(), 3);
1296+
}
1297+
12411298
#[test]
12421299
fn test_tx_forwarding() {
12431300
let mut chain_genesis = ChainGenesis::test();

core/store/src/db.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ lazy_static! {
198198
pub const HEAD_KEY: &[u8; 4] = b"HEAD";
199199
pub const TAIL_KEY: &[u8; 4] = b"TAIL";
200200
pub const CHUNK_TAIL_KEY: &[u8; 10] = b"CHUNK_TAIL";
201+
pub const FORK_TAIL_KEY: &[u8; 9] = b"FORK_TAIL";
201202
pub const SYNC_HEAD_KEY: &[u8; 9] = b"SYNC_HEAD";
202203
pub const HEADER_HEAD_KEY: &[u8; 11] = b"HEADER_HEAD";
203204
pub const LATEST_KNOWN_KEY: &[u8; 12] = b"LATEST_KNOWN";

core/store/src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ use cached::{Cached, SizedCache};
1313

1414
pub use db::DBCol::{self, *};
1515
pub use db::{
16-
CHUNK_TAIL_KEY, HEADER_HEAD_KEY, HEAD_KEY, LARGEST_TARGET_HEIGHT_KEY, LATEST_KNOWN_KEY,
17-
NUM_COLS, SHOULD_COL_GC, SKIP_COL_GC, SYNC_HEAD_KEY, TAIL_KEY,
16+
CHUNK_TAIL_KEY, FORK_TAIL_KEY, HEADER_HEAD_KEY, HEAD_KEY, LARGEST_TARGET_HEIGHT_KEY,
17+
LATEST_KNOWN_KEY, NUM_COLS, SHOULD_COL_GC, SKIP_COL_GC, SYNC_HEAD_KEY, TAIL_KEY,
1818
};
1919
use near_crypto::PublicKey;
2020
use near_primitives::account::{AccessKey, Account};

0 commit comments

Comments
 (0)