Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cluster/expected/infra/expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -1803,7 +1803,7 @@
"id": "",
"inputs": {
"description": "Logs with a severity level of warning or above",
"filter": "severity>=WARNING\nresource.type=\"k8s_container\"\nresource.labels.cluster_name=\"cn-mocknet\"\n-- Note that we ignore the validator runbook. This is because we reset it periodically, which sometimes produces noise.\nresource.labels.namespace_name=~\"sv|validator1|multi-validator|splitwell\"\n-(resource.labels.container_name=~\"participant\" AND jsonPayload.message=~\"Instrument .* has recorded multiple values for the same attributes.\")\n-- https://github.com/DACH-NY/canton-network-node/issues/10475\n-(resource.labels.container_name=\"cometbft\" AND\n ( jsonPayload.err=~\"\\Aerror adding vote\\z|\\Aalready stopped\\z|use of closed network connection\"\n OR jsonPayload._msg=~\"\\A(Stopping peer for error|Stopped accept routine, as transport is closed|Failed to write PacketMsg|Connection failed @ sendRoutine)\\z\"\n OR jsonPayload.error=\"already stopped\"\n OR textPayload=\"cp: not replacing '/cometbft/data/priv_validator_state.json'\"\n OR (jsonPayload._msg=\"Error stopping connection\" AND jsonPayload.err=\"already stopped\")\n OR jsonPayload._msg=\"Error adding peer to new bucket\"))\n-- execution context overload\n-jsonPayload.message=~\"Task runner canton-env-ec is .* overloaded\"\n-- on startup\n-textPayload=~\"Picked up JAVA_TOOL_OPTIONS:\"\n-- \\A and \\z anchor a search (=~) at beginning/end of string, respectively\n-- regex is significantly faster than OR; gcp docs themselves recommend\n-- regex-based factoring\n-resource.labels.container_name=~\"\\A(ans|wallet|scan|sv|splitwell)-web-ui\\z\"\n-- sequencer down\n-(resource.labels.namespace_name=~\"validator|splitwell\"\n AND resource.labels.container_name=~\"participant\"\n AND jsonPayload.message=~\"SEQUENCER_SUBSCRIPTION_LOST|Request failed for sequencer|Sequencer shutting down|Submission timed out|Response message for request .* timed out |periodic acknowledgement failed|Token refresh failed with Status{code=UNAVAILABLE\")\n-(resource.labels.container_name=\"postgres-exporter\" AND 
jsonPayload.msg=~\"Error loading config|Excluded databases\")\n-jsonPayload.message=~\"UnknownHostException\"\n-(resource.labels.container_name=~\"participant|mediator\" AND jsonPayload.message=~\"Late processing \\(or clock skew\\) of batch\")\n-(resource.labels.container_name=\"sequencer\" AND jsonPayload.stack_trace=~\"UnresolvedAddressException\")\n-(resource.labels.container_name=\"sequencer-pg\" AND\n (\"checkpoints are occurring too frequently\" OR \"Consider increasing the configuration parameter \\\"max_wal_size\\\".\"))\n-(resource.labels.container_name=~\"participant\" AND\n jsonPayload.message=~\"SYNC_SERVICE_ALARM.*Received a request.*where the view.*has (missing|extra) recipients|LOCAL_VERDICT_MALFORMED_PAYLOAD.*Rejected transaction due to malformed payload within views.*WrongRecipients|channel.*shutdown did not complete gracefully in allotted|LOCAL_VERDICT_FAILED_MODEL_CONFORMANCE_CHECK.*: UnvettedPackages\")\n-(resource.labels.container_name=\"mediator\" AND\n jsonPayload.message=~\"MEDIATOR_RECEIVED_MALFORMED_MESSAGE.*(Reason: (Missing root hash message for informee participants|Superfluous root hash message)|Received a (mediator|confirmation) response.*with an invalid root hash)\")\n-(jsonPayload.logger_name=~\"c.d.n.a.AdminAuthExtractor:\" AND jsonPayload.message=~\"Authorization Failed\")\n-(jsonPayload.level=\"error\" AND jsonPayload.msg=~\"/readyz\")\n-- The prometheus export server does not wait for any ongoing requests when shutting down https://github.com/prometheus/client_java/issues/938\n-jsonPayload.message=\"The Prometheus metrics HTTPServer caught an Exception while trying to send the metrics response.\"\n-- istio-proxy is spammy with warnings\n-(resource.labels.container_name=\"istio-proxy\" AND severity<ERROR)\n-resource.labels.container_name=\"postgres\"\n-(resource.labels.container_name=~\"postgres\" AND resource.labels.namespace_name=\"multi-validator\")\n-- TODO(DACH-NY/canton-network-internal#412): Remove this once we have 
improved our sv onboarding logic\n-(resource.labels.container_name=\"sv-app\" AND jsonPayload.stack_trace=~\"io.grpc.StatusRuntimeException: FAILED_PRECONDITION: UNHANDLED_EXCEPTION.*SV party has not yet operated a node\")\n-- TODO(#695): Don't just ignore this - investigate!\n-(resource.labels.container_name=\"splitwell-app\" AND jsonPayload.message=~\"Waiting for domain Domain 'global' to be connected has not completed after\")\n-- TODO(#911): Our apps can't handle ingesting bursts of transactions after delays due to the record order publisher\n-(jsonPayload.message=~\"signalWhenIngested.* has not completed after .* milliseconds\")\n\n-- TODO(DACH-NY/canton-network-node#17025): Stop ignoring these again once we have topology-aware package selection\n-(jsonPayload.\"span-name\"=\"MergeValidatorLicenseContractsTrigger\" AND (severity=WARNING OR \"has not vetted\"))\n-(jsonPayload.\"error-code\"=~\"ACS_COMMITMENT_MISMATCH\" AND jsonPayload.remote=~\"tw-cn-testnet-participant\")\n\n\n-- TODO(DACH-NY/canton-network-node#19192): suppressed faulty validator warnings until timestamp\n-(resource.labels.container_name=\"participant\"\n AND resource.labels.namespace_name=\"sv-1\"\n AND jsonPayload.message=~\"ACS_COMMITMENT_MISMATCH\"\n AND jsonPayload.remote=~\"sender = PAR::tw-cn-testnet-participant-1::122051b3a160\"\n AND timestamp <= \"2025-05-14T09:00:00.000Z\")\n\n",
"filter": "severity>=WARNING\nresource.type=\"k8s_container\"\nresource.labels.cluster_name=\"cn-mocknet\"\n-- Note that we ignore the validator runbook. This is because we reset it periodically, which sometimes produces noise.\nresource.labels.namespace_name=~\"sv|validator1|multi-validator|splitwell\"\n-(resource.labels.container_name=~\"participant\" AND jsonPayload.message=~\"Instrument .* has recorded multiple values for the same attributes.\")\n-- https://github.com/DACH-NY/canton-network-node/issues/10475\n-(resource.labels.container_name=\"cometbft\" AND\n ( jsonPayload.err=~\"\\Aerror adding vote\\z|\\Aalready stopped\\z|use of closed network connection\"\n OR jsonPayload._msg=~\"\\A(Stopping peer for error|Stopped accept routine, as transport is closed|Failed to write PacketMsg|Connection failed @ sendRoutine)\\z\"\n OR jsonPayload.error=\"already stopped\"\n OR textPayload=\"cp: not replacing '/cometbft/data/priv_validator_state.json'\"\n OR (jsonPayload._msg=\"Error stopping connection\" AND jsonPayload.err=\"already stopped\")\n OR jsonPayload._msg=\"Error adding peer to new bucket\"))\n-- execution context overload\n-jsonPayload.message=~\"Task runner canton-env-ec is .* overloaded\"\n-- on startup\n-textPayload=~\"Picked up JAVA_TOOL_OPTIONS:\"\n-- \\A and \\z anchor a search (=~) at beginning/end of string, respectively\n-- regex is significantly faster than OR; gcp docs themselves recommend\n-- regex-based factoring\n-resource.labels.container_name=~\"\\A(ans|wallet|scan|sv|splitwell)-web-ui\\z\"\n-- sequencer down\n-(resource.labels.namespace_name=~\"validator|splitwell\"\n AND resource.labels.container_name=~\"participant\"\n AND jsonPayload.message=~\"SEQUENCER_SUBSCRIPTION_LOST|Request failed for sequencer|Sequencer shutting down|Submission timed out|Response message for request .* timed out |periodic acknowledgement failed|Token refresh failed with Status{code=UNAVAILABLE\")\n-(resource.labels.container_name=\"postgres-exporter\" AND 
jsonPayload.msg=~\"Error loading config|Excluded databases\")\n-jsonPayload.message=~\"UnknownHostException\"\n-(resource.labels.container_name=~\"participant|mediator\" AND jsonPayload.message=~\"Late processing \\(or clock skew\\) of batch\")\n-(resource.labels.container_name=\"sequencer\" AND jsonPayload.stack_trace=~\"UnresolvedAddressException\")\n-(resource.labels.container_name=\"sequencer-pg\" AND\n (\"checkpoints are occurring too frequently\" OR \"Consider increasing the configuration parameter \\\"max_wal_size\\\".\"))\n-(resource.labels.container_name=~\"participant\" AND\n jsonPayload.message=~\"SYNC_SERVICE_ALARM.*Received a request.*where the view.*has (missing|extra) recipients|LOCAL_VERDICT_MALFORMED_PAYLOAD.*Rejected transaction due to malformed payload within views.*WrongRecipients|channel.*shutdown did not complete gracefully in allotted|LOCAL_VERDICT_FAILED_MODEL_CONFORMANCE_CHECK.*: UnvettedPackages\")\n-(resource.labels.container_name=\"mediator\" AND\n jsonPayload.message=~\"MEDIATOR_RECEIVED_MALFORMED_MESSAGE.*(Reason: (Missing root hash message for informee participants|Superfluous root hash message)|Received a (mediator|confirmation) response.*with an invalid root hash)\")\n-(jsonPayload.logger_name=~\"c.d.n.a.AdminAuthExtractor:\" AND jsonPayload.message=~\"Authorization Failed\")\n-(jsonPayload.level=\"error\" AND jsonPayload.msg=~\"/readyz\")\n-- The prometheus export server does not wait for any ongoing requests when shutting down https://github.com/prometheus/client_java/issues/938\n-jsonPayload.message=\"The Prometheus metrics HTTPServer caught an Exception while trying to send the metrics response.\"\n-- istio-proxy is spammy with warnings\n-(resource.labels.container_name=\"istio-proxy\" AND severity<ERROR)\n-resource.labels.container_name=\"postgres\"\n-(resource.labels.container_name=~\"postgres\" AND resource.labels.namespace_name=\"multi-validator\")\n-- TODO(DACH-NY/canton-network-internal#412): Remove this once we have 
improved our sv onboarding logic\n-(resource.labels.container_name=\"sv-app\" AND jsonPayload.stack_trace=~\"io.grpc.StatusRuntimeException: FAILED_PRECONDITION: UNHANDLED_EXCEPTION.*SV party has not yet operated a node\")\n-- TODO(#695): Don't just ignore this - investigate!\n-(resource.labels.container_name=\"splitwell-app\" AND jsonPayload.message=~\"Waiting for domain Domain 'global' to be connected has not completed after\")\n-- TODO(#911): Our apps can't handle ingesting bursts of transactions after delays due to the record order publisher\n-(jsonPayload.message=~\"signalWhenIngested.* has not completed after .* milliseconds\")\n\n-- TODO(DACH-NY/canton-network-node#17025): Stop ignoring these again once we have topology-aware package selection\n-(jsonPayload.\"span-name\"=\"MergeValidatorLicenseContractsTrigger\" AND (severity=WARNING OR \"has not vetted\"))\n-(jsonPayload.\"error-code\"=~\"ACS_COMMITMENT_MISMATCH\" AND jsonPayload.remote=~\"tw-cn-testnet-participant\")\n\n\n-- TODO(DACH-NY/canton-network-node#19192): suppressed faulty validator warnings until timestamp\n-(resource.labels.container_name=\"participant\"\n AND resource.labels.namespace_name=\"sv-1\"\n AND jsonPayload.message=~\"ACS_COMMITMENT_MISMATCH\"\n AND jsonPayload.remote=~\"sender = PAR::tw-cn-testnet-participant-1::122051b3a160\"\n AND timestamp <= \"2025-05-14T09:00:00.000Z\")\n\n\n",
"labelExtractors": {
"cluster": "EXTRACT(resource.labels.cluster_name)",
"namespace": "EXTRACT(resource.labels.namespace_name)"
Expand Down
12 changes: 12 additions & 0 deletions cluster/pulumi/infra/src/gcpAlerts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,18 @@ ${conditionalString(
AND timestamp <= "2025-05-14T09:00:00.000Z")
`
)}
${conditionalString(
isMainNet,
`-- TODO(DACH-NY/cn-test-failures#4768): suppressed faulty validator warnings until timestamp
-(resource.labels.container_name="participant"
AND (
jsonPayload.remote=~"sender = PAR::tw-cn-mainnet-participant-1::1220bc64ba15"
OR jsonPayload.remote=~"sender = PAR::northisland-prod1::12204ef1928f"
OR jsonPayload.remote=~"sender = PAR::Lukka-Inc-prod-2::1220728cfb80"
)
AND timestamp <= "2025-07-14T00:00:00.000Z")
`
)}
${conditionalString(
// making this condition more complicated causes GCP to be unable to parse the query because there are too many filters
isDevNet,
Expand Down
Loading