 import { createLogger } from '@aztec/foundation/log';
 import { retryUntil } from '@aztec/foundation/retry';

-import { AlertTriggeredError, GrafanaClient } from '../quality_of_service/grafana_client.js';
+import { AlertTriggeredError } from '../quality_of_service/grafana_client.js';
 import {
   ChainHealth,
   type ServiceEndpoint,
   applyProverBrokerKill,
   applyProverKill,
+  createResilientPrometheusConnection,
   deleteResourceByLabel,
   getGitProjectRoot,
+  getRPCEndpoint,
   setupEnvironment,
-  startPortForward,
+  waitForProvenToAdvance,
 } from './utils.js';

 const config = setupEnvironment(process.env);

 const logger = createLogger('e2e:spartan-test:prover-node');

 const epochDurationSeconds = config.AZTEC_EPOCH_DURATION * config.AZTEC_SLOT_DURATION;
+const slotDurationSeconds = config.AZTEC_SLOT_DURATION;

 /**
  * This test aims to check that a prover node is able to recover after a crash.
@@ -55,49 +58,20 @@ const enqueuedRootRollupJobs = {

 describe('prover node recovery', () => {
   const endpoints: ServiceEndpoint[] = [];
-  let alertChecker: GrafanaClient;
+  let runAlertCheck: ReturnType<typeof createResilientPrometheusConnection>['runAlertCheck'];
   let spartanDir: string;
+  let rpcEndpoint: ServiceEndpoint;
   const health = new ChainHealth(config.NAMESPACE, logger);

   beforeAll(async () => {
     await health.setup();
-    // Try Prometheus in a dedicated metrics namespace first; if not present, fall back to the network namespace
-    let promPort = 0;
-    let promUrl = '';
-    let promProc: Awaited<ReturnType<typeof startPortForward>>['process'];
-    {
-      const result = await startPortForward({
-        resource: `svc/metrics-prometheus-server`,
-        namespace: 'metrics',
-        containerPort: 80,
-      });
-      promProc = result.process;
-      promPort = result.port;
-      promUrl = `http://127.0.0.1:${promPort}/api/v1`;
-      if (promPort === 0) {
-        result.process.kill();
-      }
-    }
-
-    if (promPort === 0) {
-      const result = await startPortForward({
-        resource: `svc/prometheus-server`,
-        namespace: config.NAMESPACE,
-        containerPort: 80,
-      });
-      promProc = result.process;
-      promPort = result.port;
-      promUrl = `http://127.0.0.1:${promPort}/api/v1`;
-    }
-
-    if (!promProc || promPort === 0) {
-      throw new Error('Unable to port-forward to Prometheus. Ensure the metrics stack is deployed.');
-    }
-
-    endpoints.push({ url: promUrl, process: promProc });
-    const grafanaEndpoint = promUrl;
-    const grafanaCredentials = '';
-    alertChecker = new GrafanaClient(logger, { grafanaEndpoint, grafanaCredentials });
+
+    rpcEndpoint = await getRPCEndpoint(config.NAMESPACE);
+    endpoints.push(rpcEndpoint);
+
+    const prometheus = createResilientPrometheusConnection(config.NAMESPACE, endpoints, logger);
+    await prometheus.connect();
+    runAlertCheck = prometheus.runAlertCheck;

     spartanDir = `${getGitProjectRoot()}/spartan`;
   });
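The `createResilientPrometheusConnection` and `getRPCEndpoint` helpers live in `./utils.js` and are not part of this diff. A minimal sketch of what the Prometheus helper might look like, assuming it simply wraps the port-forward fallback logic removed above (try `svc/metrics-prometheus-server` in the `metrics` namespace, then `svc/prometheus-server` in the network namespace) and re-exposes `GrafanaClient.runAlertCheck` once a target is reachable:

```ts
// Sketch only, not the actual utils.js implementation. Assumes startPortForward,
// ServiceEndpoint, and GrafanaClient behave as in the code removed above.
import { createLogger } from '@aztec/foundation/log';

import { GrafanaClient } from '../quality_of_service/grafana_client.js';
import { type ServiceEndpoint, startPortForward } from './utils.js';

type Logger = ReturnType<typeof createLogger>;

export function createResilientPrometheusConnection(namespace: string, endpoints: ServiceEndpoint[], logger: Logger) {
  let client: GrafanaClient | undefined;

  const connect = async () => {
    // Prefer a dedicated metrics namespace, then fall back to the network namespace.
    const candidates = [
      { resource: `svc/metrics-prometheus-server`, namespace: 'metrics', containerPort: 80 },
      { resource: `svc/prometheus-server`, namespace, containerPort: 80 },
    ];
    for (const candidate of candidates) {
      const result = await startPortForward(candidate);
      if (result.port === 0) {
        result.process.kill();
        continue;
      }
      const grafanaEndpoint = `http://127.0.0.1:${result.port}/api/v1`;
      endpoints.push({ url: grafanaEndpoint, process: result.process });
      client = new GrafanaClient(logger, { grafanaEndpoint, grafanaCredentials: '' });
      return;
    }
    throw new Error('Unable to port-forward to Prometheus. Ensure the metrics stack is deployed.');
  };

  // Same shape as GrafanaClient.runAlertCheck, but guarded until connect() has succeeded.
  const runAlertCheck: GrafanaClient['runAlertCheck'] = alerts => {
    if (!client) {
      throw new Error('Prometheus connection not established; call connect() first');
    }
    return client.runAlertCheck(alerts);
  };

  return { connect, runAlertCheck };
}
```

Folding the fallback behind `connect()` keeps this test's `beforeAll` short and lets other specs reuse the same connection logic instead of repeating the port-forward dance.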
@@ -120,7 +94,7 @@ describe('prover node recovery', () => {
     await retryUntil(
       async () => {
         try {
-          await alertChecker.runAlertCheck([enqueuedBlockRollupJobs]);
+          await runAlertCheck([enqueuedBlockRollupJobs]);
         } catch (err) {
           return err && err instanceof AlertTriggeredError;
         }
@@ -139,12 +113,11 @@ describe('prover node recovery', () => {
       values: { 'global.chaosResourceNamespace': config.NAMESPACE },
     });

-    // wait for the node to start proving again and
-    // validate it hits the cache
+    // Wait for the node to start proving again and validate it hits the cache
     const result = await retryUntil(
       async () => {
         try {
-          await alertChecker.runAlertCheck([cachedProvingJobs]);
+          await runAlertCheck([cachedProvingJobs]);
         } catch (err) {
           if (err && err instanceof AlertTriggeredError) {
             return true;
@@ -163,10 +136,11 @@ describe('prover node recovery', () => {
   it('should recover after a broker crash', async () => {
     logger.info(`Waiting for epoch proving job to start`);

+    // First, wait for proving to be active
     await retryUntil(
       async () => {
         try {
-          await alertChecker.runAlertCheck([enqueuedBlockRollupJobs]);
+          await runAlertCheck([enqueuedBlockRollupJobs]);
         } catch (err) {
           return err && err instanceof AlertTriggeredError;
         }
@@ -176,7 +150,11 @@ describe('prover node recovery', () => {
       5,
     );

-    logger.info(`Detected epoch proving job. Killing the broker`);
+    logger.info(`Detected epoch proving job. Waiting for proven block to advance...`);
+
+    await waitForProvenToAdvance(rpcEndpoint.url, logger, epochDurationSeconds * 3, slotDurationSeconds);
+
+    logger.info(`Proven block advanced. Killing the broker`);

     await applyProverBrokerKill({
       namespace: config.NAMESPACE,
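`waitForProvenToAdvance` is likewise defined in `./utils.js` and not shown in this diff. A sketch under the assumption that it records the current proven block number and polls the node over RPC until it increases, using `retryUntil` with the slot duration as the polling interval; `createAztecNodeClient` and `getProvenBlockNumber` are assumptions about the node client API, not taken from this change:

```ts
// Sketch only. The real helper may query the node differently; the client calls below
// are assumptions about the Aztec node API rather than code taken from this PR.
import { createAztecNodeClient } from '@aztec/aztec.js';
import { createLogger } from '@aztec/foundation/log';
import { retryUntil } from '@aztec/foundation/retry';

type Logger = ReturnType<typeof createLogger>;

export async function waitForProvenToAdvance(
  nodeUrl: string,
  logger: Logger,
  timeoutSeconds: number,
  intervalSeconds: number,
) {
  const node = createAztecNodeClient(nodeUrl);
  const startingProvenBlock = await node.getProvenBlockNumber();
  logger.info(`Proven block is currently ${startingProvenBlock}`);

  // Poll until the proven chain tip moves past the starting block, or time out.
  await retryUntil(
    async () => {
      const provenBlock = await node.getProvenBlockNumber();
      logger.verbose(`Proven block is now ${provenBlock}`);
      return provenBlock > startingProvenBlock;
    },
    'wait for proven block to advance',
    timeoutSeconds,
    intervalSeconds,
  );
}
```

The extra wait before the kill presumably ensures the broker dies while an epoch proof is genuinely in flight, so the recovery path checked below (the root rollup alert firing again) is actually exercised.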
@@ -185,16 +163,16 @@ describe('prover node recovery', () => {
       values: { 'global.chaosResourceNamespace': config.NAMESPACE },
     });

+    // Wait for the broker to recover and proving to resume
     const result = await retryUntil(
       async () => {
         try {
-          await alertChecker.runAlertCheck([enqueuedRootRollupJobs]);
+          await runAlertCheck([enqueuedRootRollupJobs]);
         } catch (err) {
           if (err && err instanceof AlertTriggeredError) {
             return true;
           }
         }
-
         return false;
       },
       'wait for root rollup',