Skip to content

Commit 93e5c77

Browse files
authored
[SharovBot] fix(txpool): evict zombie queued txns exceeding MaxNonceGap (#19449)
**[SharovBot]** ## Split from #19393 per @yperbasis review This PR contains **Bug #2 only** (zombie queued transaction eviction), extracted from #19393, which was asked to be split into separate PRs. ## Problem Queued transactions with an impossibly large nonce gap (e.g. on-chain nonce=281, queued nonce=16,814 — a gap of 16,533) sit in the pool forever. They can never become pending without filling thousands of nonce positions first, causing unbounded queued-pool bloat (4,000+ txns observed on Gnosis Chain, Erigon 3.3.8). The existing blob-txn nonce-gap eviction only covered `BlobTxnType`; regular transactions had no gap limit. ## Fix - Add `MaxNonceGap uint64` to `txpoolcfg.Config` (default: 64) - Add `NonceTooDistant` (DiscardReason 37) for observability - In `onSenderStateChange`, evict txns whose nonce exceeds `noGapsNonce` by more than `MaxNonceGap` - `noGapsNonce` accounts for consecutive txns already pooled, so consecutive txns are never zombie-evicted - Fix `toDelReasons` parallel slice to track the correct discard reason per evicted tx (it previously always logged `NonceTooLow`) ## Tests - `TestZombieQueuedEviction` — 3 sub-tests: 1. Zombie tx (gap=65 > MaxNonceGap=64) is evicted with `NonceTooDistant` 2. Tx at exactly the MaxNonceGap boundary (gap=64) is kept 3. Consecutive txns beyond MaxNonceGap are never zombie-evicted ## Testing ``` go build ./txnprovider/txpool/... ✅ go test ./txnprovider/txpool/... -run TestZombieQueuedEviction -count=1 ✅ ``` ## Related - Bug #1 (stale pending / AuRa nonce) will be addressed separately per @yperbasis feedback - Backport to release/3.3 will follow once this is merged - Original combined PR: #19393
1 parent 1fc4ace commit 93e5c77

File tree

3 files changed

+189
-3
lines changed

3 files changed

+189
-3
lines changed

txnprovider/txpool/pool.go

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1863,12 +1863,22 @@ func (p *TxPool) onSenderStateChange(senderID uint64, senderNonce uint64, sender
18631863
cumulativeRequiredBalance := uint256.NewInt(0)
18641864
minFeeCap := uint256.NewInt(0).SetAllOne()
18651865
minTip := uint64(math.MaxUint64)
1866-
var toDel []*metaTxn // can't delete items while iterate them
1866+
var toDel []*metaTxn // can't delete items while iterate them
1867+
var toDelReasons []txpoolcfg.DiscardReason // parallel reasons slice for toDel
18671868

18681869
p.all.ascend(senderID, func(mt *metaTxn) bool {
18691870
deleteAndContinueReasonLog := ""
1871+
discardReason := txpoolcfg.NonceTooLow
18701872
if senderNonce > mt.TxnSlot.Nonce {
18711873
deleteAndContinueReasonLog = "low nonce"
1874+
} else if p.cfg.MaxNonceGap > 0 && mt.TxnSlot.Nonce > noGapsNonce && mt.TxnSlot.Nonce-noGapsNonce > p.cfg.MaxNonceGap {
1875+
// Evict "zombie" queued transactions whose nonce is so far ahead of the sender's
1876+
// on-chain nonce (accounting for any consecutive txns already in the pool) that they
1877+
// can practically never become pending. This prevents unbounded pool bloat from accounts
1878+
// that submitted transactions with impossibly large nonce gaps (e.g. nonce 144968 when
1879+
// on-chain nonce is 6398). The gap threshold is configurable via MaxNonceGap (default 64).
1880+
deleteAndContinueReasonLog = "nonce gap too large"
1881+
discardReason = txpoolcfg.NonceTooDistant
18721882
} else if mt.TxnSlot.Nonce != noGapsNonce && mt.TxnSlot.Type == BlobTxnType { // Discard nonce-gapped blob txns
18731883
deleteAndContinueReasonLog = "nonce-gapped blob txn"
18741884
}
@@ -1888,6 +1898,7 @@ func (p *TxPool) onSenderStateChange(senderID uint64, senderNonce uint64, sender
18881898
//already removed
18891899
}
18901900
toDel = append(toDel, mt)
1901+
toDelReasons = append(toDelReasons, discardReason)
18911902
return true
18921903
}
18931904

@@ -1956,8 +1967,8 @@ func (p *TxPool) onSenderStateChange(senderID uint64, senderNonce uint64, sender
19561967
return true
19571968
})
19581969

1959-
for _, mt := range toDel {
1960-
p.discardLocked(mt, txpoolcfg.NonceTooLow)
1970+
for i, mt := range toDel {
1971+
p.discardLocked(mt, toDelReasons[i])
19611972
}
19621973

19631974
logger.Trace("[txpool] onSenderStateChange", "sender", senderID, "count", p.all.count(senderID), "pending", p.pending.Len(), "baseFee", p.baseFee.Len(), "queued", p.queued.Len())

txnprovider/txpool/pool_test.go

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1815,3 +1815,167 @@ func BenchmarkProcessRemoteTxns(b *testing.B) {
18151815
pending, baseFee, queued := pool.CountContent()
18161816
b.Logf("Final pool stats - pending: %d, baseFee: %d, queued: %d", pending, baseFee, queued)
18171817
}
1818+
1819+
// TestZombieQueuedEviction verifies that queued transactions whose nonce is so far ahead of
1820+
// the sender's on-chain nonce that they can never become pending are evicted from the pool.
1821+
// This covers Bug #2: "zombie" queued txns on Gnosis Chain (e.g. on-chain nonce=281 but
1822+
// queued nonce=16814, a gap of 16,533 that can never be filled).
1823+
func TestZombieQueuedEviction(t *testing.T) {
1824+
assert, require := assert.New(t), require.New(t)
1825+
ch := make(chan Announcements, 100)
1826+
coreDB := temporaltest.NewTestDB(t, datadir.New(t.TempDir()))
1827+
db := memdb.NewTestPoolDB(t)
1828+
ctx, cancel := context.WithCancel(context.Background())
1829+
t.Cleanup(cancel)
1830+
1831+
cfg := txpoolcfg.DefaultConfig
1832+
cfg.MaxNonceGap = 64 // explicit, same as default
1833+
sendersCache := kvcache.New(kvcache.DefaultCoherentConfig)
1834+
pool, err := New(ctx, ch, db, coreDB, cfg, sendersCache, chain.TestChainConfig, nil, nil, func() {}, nil, nil, log.New(), WithFeeCalculator(nil))
1835+
require.NoError(err)
1836+
require.NotNil(pool)
1837+
1838+
pendingBaseFee := uint64(200_000)
1839+
h1 := gointerfaces.ConvertHashToH256([32]byte{})
1840+
var senderAddr [20]byte
1841+
senderAddr[0] = 0x42
1842+
1843+
// Set sender's on-chain nonce = 5
1844+
acc := accounts3.Account{
1845+
Nonce: 5,
1846+
Balance: *uint256.NewInt(1 * common.Ether),
1847+
CodeHash: accounts.EmptyCodeHash,
1848+
Incarnation: 0,
1849+
}
1850+
v := accounts3.SerialiseV3(&acc)
1851+
change := &remoteproto.StateChangeBatch{
1852+
StateVersionId: 0,
1853+
PendingBlockBaseFee: pendingBaseFee,
1854+
BlockGasLimit: 1_000_000,
1855+
ChangeBatch: []*remoteproto.StateChange{
1856+
{
1857+
BlockHeight: 0,
1858+
BlockHash: h1,
1859+
Changes: []*remoteproto.AccountChange{
1860+
{
1861+
Action: remoteproto.Action_UPSERT,
1862+
Address: gointerfaces.ConvertAddressToH160(senderAddr),
1863+
Data: v,
1864+
},
1865+
},
1866+
},
1867+
},
1868+
}
1869+
require.NoError(pool.OnNewBlock(ctx, change, TxnSlots{}, TxnSlots{}, TxnSlots{}))
1870+
1871+
t.Run("zombie tx with impossible nonce gap is evicted", func(t *testing.T) {
1872+
// nonce = 5 + 64 + 1 = 70, gap=65 > MaxNonceGap(64)
1873+
zombieNonce := uint64(5 + cfg.MaxNonceGap + 1)
1874+
var txnSlots TxnSlots
1875+
slot := &TxnSlot{
1876+
Tip: *uint256.NewInt(300_000),
1877+
FeeCap: *uint256.NewInt(300_000),
1878+
Gas: 100_000,
1879+
Nonce: zombieNonce,
1880+
}
1881+
slot.IDHash[0] = 0xAA
1882+
txnSlots.Append(slot, senderAddr[:], true)
1883+
1884+
reasons, err := pool.AddLocalTxns(ctx, txnSlots)
1885+
require.NoError(err)
1886+
require.Len(reasons, 1)
1887+
assert.Equal(txpoolcfg.NonceTooDistant, reasons[0],
1888+
"zombie tx (nonce gap %d > MaxNonceGap %d) should be evicted with NonceTooDistant",
1889+
zombieNonce-5, cfg.MaxNonceGap)
1890+
1891+
_, _, queued := pool.CountContent()
1892+
assert.Equal(0, queued, "queued pool should be empty after zombie eviction")
1893+
})
1894+
1895+
t.Run("tx at exactly MaxNonceGap boundary is kept", func(t *testing.T) {
1896+
// nonce = 5 + 64 = 69, gap=64 == MaxNonceGap (NOT evicted)
1897+
boundaryNonce := uint64(5 + cfg.MaxNonceGap)
1898+
var txnSlots TxnSlots
1899+
slot := &TxnSlot{
1900+
Tip: *uint256.NewInt(300_000),
1901+
FeeCap: *uint256.NewInt(300_000),
1902+
Gas: 100_000,
1903+
Nonce: boundaryNonce,
1904+
}
1905+
slot.IDHash[0] = 0xBB
1906+
txnSlots.Append(slot, senderAddr[:], true)
1907+
1908+
reasons, err := pool.AddLocalTxns(ctx, txnSlots)
1909+
require.NoError(err)
1910+
require.Len(reasons, 1)
1911+
// gap = boundaryNonce - noGapsNonce. noGapsNonce=5 (no consecutive txns).
1912+
// 64 is NOT > 64, so the tx should be kept (in queued due to nonce gap).
1913+
assert.Equal(txpoolcfg.Success, reasons[0],
1914+
"tx at exactly MaxNonceGap boundary (gap=%d) should be accepted", cfg.MaxNonceGap)
1915+
})
1916+
1917+
t.Run("consecutive txns beyond MaxNonceGap are kept", func(t *testing.T) {
1918+
// If there are consecutive txns 5, 6, 7, ..., 5+MaxNonceGap+5, they should all be kept
1919+
// because the gap from noGapsNonce is always 0 for consecutive txns.
1920+
var txnSlots TxnSlots
1921+
baseNonce := uint64(5)
1922+
// Clear the pool first by using a fresh pool
1923+
ch2 := make(chan Announcements, 100)
1924+
coreDB2 := temporaltest.NewTestDB(t, datadir.New(t.TempDir()))
1925+
db2 := memdb.NewTestPoolDB(t)
1926+
cfg2 := txpoolcfg.DefaultConfig
1927+
cfg2.MaxNonceGap = 10 // small gap for this test
1928+
pool2, err := New(ctx, ch2, db2, coreDB2, cfg2, kvcache.New(kvcache.DefaultCoherentConfig),
1929+
chain.TestChainConfig, nil, nil, func() {}, nil, nil, log.New(), WithFeeCalculator(nil))
1930+
require.NoError(err)
1931+
1932+
acc2 := accounts3.Account{
1933+
Nonce: baseNonce,
1934+
Balance: *uint256.NewInt(10 * common.Ether),
1935+
CodeHash: accounts.EmptyCodeHash,
1936+
}
1937+
v2 := accounts3.SerialiseV3(&acc2)
1938+
var addr2 [20]byte
1939+
addr2[0] = 0x99
1940+
change2 := &remoteproto.StateChangeBatch{
1941+
StateVersionId: 0,
1942+
PendingBlockBaseFee: pendingBaseFee,
1943+
BlockGasLimit: 1_000_000,
1944+
ChangeBatch: []*remoteproto.StateChange{
1945+
{
1946+
BlockHeight: 0,
1947+
BlockHash: gointerfaces.ConvertHashToH256([32]byte{1}),
1948+
Changes: []*remoteproto.AccountChange{
1949+
{Action: remoteproto.Action_UPSERT, Address: gointerfaces.ConvertAddressToH160(addr2), Data: v2},
1950+
},
1951+
},
1952+
},
1953+
}
1954+
require.NoError(pool2.OnNewBlock(ctx, change2, TxnSlots{}, TxnSlots{}, TxnSlots{}))
1955+
1956+
// Add consecutive txns: nonces 5, 6, 7, ..., 5+MaxNonceGap+5 = 20
1957+
count := int(cfg2.MaxNonceGap + 5 + 1)
1958+
for i := 0; i < count; i++ {
1959+
txnSlots.Txns = nil
1960+
txnSlots.Senders = txnSlots.Senders[:0]
1961+
txnSlots.IsLocal = txnSlots.IsLocal[:0]
1962+
slot := &TxnSlot{
1963+
Tip: *uint256.NewInt(300_000),
1964+
FeeCap: *uint256.NewInt(300_000),
1965+
Gas: 100_000,
1966+
Nonce: baseNonce + uint64(i),
1967+
}
1968+
slot.IDHash[0] = uint8(0xC0 + i)
1969+
txnSlots.Append(slot, addr2[:], true)
1970+
reasons, err := pool2.AddLocalTxns(ctx, txnSlots)
1971+
require.NoError(err)
1972+
assert.Equal(txpoolcfg.Success, reasons[0],
1973+
"consecutive tx nonce=%d should not be evicted (gap from noGapsNonce is 0)", baseNonce+uint64(i))
1974+
}
1975+
// All consecutive txns (no nonce gaps) should be accepted and promoted to pending.
1976+
// The key check is that NONE were evicted with NonceTooDistant — verified above per-tx.
1977+
pending2, _, queued2 := pool2.CountContent()
1978+
assert.Equal(0, queued2, "no consecutive txns should be zombie-evicted (queued should be drained to pending)")
1979+
assert.Equal(count, pending2, "all consecutive txns should be pending (no gaps, sufficient balance)")
1980+
})
1981+
}

txnprovider/txpool/txpoolcfg/txpoolcfg.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,12 @@ type Config struct {
4141
TotalBlobPoolLimit uint64 // Total number of blobs (not txns) allowed within the txpool
4242
PriceBump uint64 // Price bump percentage to replace an already existing transaction
4343
BlobPriceBump uint64 //Price bump percentage to replace an existing 4844 blob txn (type-3)
44+
// MaxNonceGap is the maximum allowed gap between a sender's on-chain nonce and the nonce of a
45+
// queued transaction. Transactions whose nonce exceeds the on-chain nonce by more than this
46+
// value are considered "zombie" transactions that can never become pending (they would require
47+
// an impossibly large number of preceding transactions to fill the gap), and are evicted from
48+
// the pool. A value of 0 disables this eviction. Default is 64.
49+
MaxNonceGap uint64
4450

4551
// regular batch tasks processing
4652
SyncToNewPeersEvery time.Duration
@@ -77,6 +83,8 @@ var DefaultConfig = Config{
7783
PriceBump: 10, // Price bump percentage to replace an already existing transaction
7884
BlobPriceBump: 100,
7985

86+
MaxNonceGap: 64, // Evict queued txns with a nonce gap > 64 from the on-chain nonce
87+
8088
NoGossip: false,
8189
MdbxWriteMap: false,
8290
}
@@ -121,6 +129,7 @@ const (
121129
ErrAuthorityReserved DiscardReason = 34 // EIP-7702 transaction with authority already reserved
122130
InvalidAA DiscardReason = 35 // Invalid RIP-7560 transaction
123131
ErrGetCode DiscardReason = 36 // Error getting code during AA validation
132+
NonceTooDistant DiscardReason = 37 // Nonce gap between tx and on-chain nonce exceeds MaxNonceGap; tx can never become pending
124133
)
125134

126135
func (r DiscardReason) String() string {
@@ -199,6 +208,8 @@ func (r DiscardReason) String() string {
199208
return "RIP-7560 transaction failed validation"
200209
case ErrGetCode:
201210
return "error getting account code during RIP-7560 validation"
211+
case NonceTooDistant:
212+
return "nonce gap too large: transaction nonce is too far ahead of sender's on-chain nonce"
202213
default:
203214
panic(fmt.Sprintf("discard reason: %d", r))
204215
}

0 commit comments

Comments (0)