Skip to content

Commit d31f816

Browse files
author
danielntmd
committed
chore: scenario network test fixes
- Explicitly set the timeout to be less than 6 hours so that the workflow can complete network teardown. - When rolling pods, wait for every state to progress before continuing, wait for the PVC to be completely wiped, and roll the HA db first and initialize the DB; this will reduce k8s provisioning flakes. - Re-establish Prometheus port forwards on failure for the prover-node test, and wait for the proven chain to update before killing the prover broker, to make the test more consistent. - Add a component label to the HA postgres DB.
1 parent 8b8f888 commit d31f816

File tree

11 files changed

+428
-79
lines changed

11 files changed

+428
-79
lines changed

.github/workflows/ci3.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ jobs:
134134
ref: ${{ github.event.pull_request.head.sha || github.sha }}
135135

136136
- name: Run Network Scenarios
137+
timeout-minutes: 350
137138
env:
138139
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
139140
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

.github/workflows/test-network-scenarios.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,12 @@ concurrency:
3232
jobs:
3333
deploy-and-test:
3434
runs-on: ubuntu-latest
35-
timeout-minutes: 360
3635
steps:
3736
- name: Checkout
3837
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
3938

4039
- name: Run Network Scenarios
40+
timeout-minutes: 350
4141
env:
4242
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
4343
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

spartan/aztec-postgres/templates/_helpers.tpl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,6 @@
55
{{- define "aztec-postgres.labels" -}}
66
app.kubernetes.io/name: {{ .Chart.Name }}
77
app.kubernetes.io/instance: {{ .Release.Name }}
8+
app.kubernetes.io/component: {{ .Values.component | default .Chart.Name }}
89
app.kubernetes.io/managed-by: {{ .Release.Service }}
910
{{- end }}

spartan/aztec-postgres/templates/statefulset.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ spec:
1616
labels:
1717
app.kubernetes.io/name: {{ .Chart.Name }}
1818
app.kubernetes.io/instance: {{ .Release.Name }}
19+
app.kubernetes.io/component: {{ .Values.component | default .Chart.Name }}
1920
spec:
2021
containers:
2122
- name: postgres
@@ -62,6 +63,10 @@ spec:
6263
volumeClaimTemplates:
6364
- metadata:
6465
name: data
66+
labels:
67+
app.kubernetes.io/name: {{ .Chart.Name }}
68+
app.kubernetes.io/instance: {{ .Release.Name }}
69+
app.kubernetes.io/component: {{ .Values.component | default .Chart.Name }}
6570
spec:
6671
accessModes: ["ReadWriteOnce"]
6772
{{- if .Values.persistence.storageClass }}

spartan/aztec-postgres/values.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ image:
33
tag: "18"
44
pullPolicy: IfNotPresent
55

6+
component: "validator-ha-db"
7+
68
auth:
79
database: validator_ha
810
username: validator

spartan/bootstrap.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,6 @@ _emit_test() { echo "$_test_cmd_prefix $_test_cmd_run src/spartan/$1"; }
7474

7575
function network_test_cmds_1 {
7676
_emit_test smoke.test.ts
77-
_emit_test gating-passive.test.ts
7877
_emit_test reorg.test.ts
7978
_emit_test upgrade_rollup_version.test.ts
8079
_emit_test validator_ha.test.ts
@@ -86,6 +85,7 @@ function network_test_cmds_2 {
8685
_emit_test slash_inactivity.test.ts
8786
_emit_test proving.test.ts
8887
_emit_test prover-node.test.ts
88+
_emit_test gating-passive.test.ts
8989
_emit_test invalidate_blocks.test.ts
9090
_emit_test mempool_limit.test.ts
9191
_emit_test upgrade_governance_proposer.test.ts

yarn-project/end-to-end/src/spartan/prover-node.test.ts

Lines changed: 26 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,26 @@
11
import { createLogger } from '@aztec/foundation/log';
22
import { retryUntil } from '@aztec/foundation/retry';
33

4-
import { AlertTriggeredError, GrafanaClient } from '../quality_of_service/grafana_client.js';
4+
import { AlertTriggeredError } from '../quality_of_service/grafana_client.js';
55
import {
66
ChainHealth,
77
type ServiceEndpoint,
88
applyProverBrokerKill,
99
applyProverKill,
10+
createResilientPrometheusConnection,
1011
deleteResourceByLabel,
1112
getGitProjectRoot,
13+
getRPCEndpoint,
1214
setupEnvironment,
13-
startPortForward,
15+
waitForProvenToAdvance,
1416
} from './utils.js';
1517

1618
const config = setupEnvironment(process.env);
1719

1820
const logger = createLogger('e2e:spartan-test:prover-node');
1921

2022
const epochDurationSeconds = config.AZTEC_EPOCH_DURATION * config.AZTEC_SLOT_DURATION;
23+
const slotDurationSeconds = config.AZTEC_SLOT_DURATION;
2124

2225
/**
2326
* This test aims to check that a prover node is able to recover after a crash.
@@ -55,49 +58,20 @@ const enqueuedRootRollupJobs = {
5558

5659
describe('prover node recovery', () => {
5760
const endpoints: ServiceEndpoint[] = [];
58-
let alertChecker: GrafanaClient;
61+
let runAlertCheck: ReturnType<typeof createResilientPrometheusConnection>['runAlertCheck'];
5962
let spartanDir: string;
63+
let rpcEndpoint: ServiceEndpoint;
6064
const health = new ChainHealth(config.NAMESPACE, logger);
6165

6266
beforeAll(async () => {
6367
await health.setup();
64-
// Try Prometheus in a dedicated metrics namespace first; if not present, fall back to the network namespace
65-
let promPort = 0;
66-
let promUrl = '';
67-
let promProc: Awaited<ReturnType<typeof startPortForward>>['process'];
68-
{
69-
const result = await startPortForward({
70-
resource: `svc/metrics-prometheus-server`,
71-
namespace: 'metrics',
72-
containerPort: 80,
73-
});
74-
promProc = result.process;
75-
promPort = result.port;
76-
promUrl = `http://127.0.0.1:${promPort}/api/v1`;
77-
if (promPort === 0) {
78-
result.process.kill();
79-
}
80-
}
81-
82-
if (promPort === 0) {
83-
const result = await startPortForward({
84-
resource: `svc/prometheus-server`,
85-
namespace: config.NAMESPACE,
86-
containerPort: 80,
87-
});
88-
promProc = result.process;
89-
promPort = result.port;
90-
promUrl = `http://127.0.0.1:${promPort}/api/v1`;
91-
}
92-
93-
if (!promProc || promPort === 0) {
94-
throw new Error('Unable to port-forward to Prometheus. Ensure the metrics stack is deployed.');
95-
}
96-
97-
endpoints.push({ url: promUrl, process: promProc });
98-
const grafanaEndpoint = promUrl;
99-
const grafanaCredentials = '';
100-
alertChecker = new GrafanaClient(logger, { grafanaEndpoint, grafanaCredentials });
68+
69+
rpcEndpoint = await getRPCEndpoint(config.NAMESPACE);
70+
endpoints.push(rpcEndpoint);
71+
72+
const prometheus = createResilientPrometheusConnection(config.NAMESPACE, endpoints, logger);
73+
await prometheus.connect();
74+
runAlertCheck = prometheus.runAlertCheck;
10175

10276
spartanDir = `${getGitProjectRoot()}/spartan`;
10377
});
@@ -120,7 +94,7 @@ describe('prover node recovery', () => {
12094
await retryUntil(
12195
async () => {
12296
try {
123-
await alertChecker.runAlertCheck([enqueuedBlockRollupJobs]);
97+
await runAlertCheck([enqueuedBlockRollupJobs]);
12498
} catch (err) {
12599
return err && err instanceof AlertTriggeredError;
126100
}
@@ -139,12 +113,11 @@ describe('prover node recovery', () => {
139113
values: { 'global.chaosResourceNamespace': config.NAMESPACE },
140114
});
141115

142-
// wait for the node to start proving again and
143-
// validate it hits the cache
116+
// Wait for the node to start proving again and validate it hits the cache
144117
const result = await retryUntil(
145118
async () => {
146119
try {
147-
await alertChecker.runAlertCheck([cachedProvingJobs]);
120+
await runAlertCheck([cachedProvingJobs]);
148121
} catch (err) {
149122
if (err && err instanceof AlertTriggeredError) {
150123
return true;
@@ -163,10 +136,11 @@ describe('prover node recovery', () => {
163136
it('should recover after a broker crash', async () => {
164137
logger.info(`Waiting for epoch proving job to start`);
165138

139+
// First, wait for proving to be active
166140
await retryUntil(
167141
async () => {
168142
try {
169-
await alertChecker.runAlertCheck([enqueuedBlockRollupJobs]);
143+
await runAlertCheck([enqueuedBlockRollupJobs]);
170144
} catch (err) {
171145
return err && err instanceof AlertTriggeredError;
172146
}
@@ -176,7 +150,11 @@ describe('prover node recovery', () => {
176150
5,
177151
);
178152

179-
logger.info(`Detected epoch proving job. Killing the broker`);
153+
logger.info(`Detected epoch proving job. Waiting for proven block to advance...`);
154+
155+
await waitForProvenToAdvance(rpcEndpoint.url, logger, epochDurationSeconds * 3, slotDurationSeconds);
156+
157+
logger.info(`Proven block advanced. Killing the broker`);
180158

181159
await applyProverBrokerKill({
182160
namespace: config.NAMESPACE,
@@ -185,16 +163,16 @@ describe('prover node recovery', () => {
185163
values: { 'global.chaosResourceNamespace': config.NAMESPACE },
186164
});
187165

166+
// Wait for the broker to recover and proving to resume
188167
const result = await retryUntil(
189168
async () => {
190169
try {
191-
await alertChecker.runAlertCheck([enqueuedRootRollupJobs]);
170+
await runAlertCheck([enqueuedRootRollupJobs]);
192171
} catch (err) {
193172
if (err && err instanceof AlertTriggeredError) {
194173
return true;
195174
}
196175
}
197-
198176
return false;
199177
},
200178
'wait for root rollup',

yarn-project/end-to-end/src/spartan/upgrade_rollup_version.test.ts

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ import { mnemonicToAccount } from 'viem/accounts';
2626

2727
import { MNEMONIC } from '../fixtures/fixtures.js';
2828
import {
29-
ChainHealth,
3029
type ServiceEndpoint,
3130
getEthereumEndpoint,
3231
getRPCEndpoint,
@@ -47,16 +46,13 @@ describe('spartan_upgrade_rollup_version', () => {
4746
let ETHEREUM_HOSTS: string[];
4847
let originalL1ContractAddresses: L1ContractAddresses;
4948
const endpoints: ServiceEndpoint[] = [];
50-
const health = new ChainHealth(config.NAMESPACE, debugLogger);
5149
jest.setTimeout(3 * 60 * 60 * 1000); // Governance flow can take a while
5250

53-
afterAll(async () => {
54-
await health.teardown();
51+
afterAll(() => {
5552
endpoints.forEach(e => e.process?.kill());
5653
});
5754

5855
beforeAll(async () => {
59-
await health.setup();
6056
const rpcEndpoint = await getRPCEndpoint(config.NAMESPACE);
6157
const ethEndpoint = await getEthereumEndpoint(config.NAMESPACE);
6258
endpoints.push(rpcEndpoint, ethEndpoint);

yarn-project/end-to-end/src/spartan/utils/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ export {
2424
getServiceEndpoint,
2525
getRPCEndpoint,
2626
getEthereumEndpoint,
27+
createResilientPrometheusConnection,
2728
} from './k8s.js';
2829

2930
// Chaos Mesh
@@ -45,6 +46,7 @@ export { restartBot, installTransferBot, uninstallTransferBot } from './bot.js';
4546
// Node operations (sequencers, validators, pods)
4647
export {
4748
awaitCheckpointNumber,
49+
waitForProvenToAdvance,
4850
getSequencers,
4951
updateSequencersConfig,
5052
getSequencersConfig,

0 commit comments

Comments (0)