From b0a000ffa6fde274163b883ac9777e4247c1a699 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oriol=20Mu=C3=B1oz?= Date: Thu, 19 Jun 2025 11:27:46 +0000 Subject: [PATCH 1/4] Ignore trigger cardinality exceeded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ci] Signed-off-by: Oriol Muñoz --- cluster/pulumi/infra/src/gcpAlerts.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cluster/pulumi/infra/src/gcpAlerts.ts b/cluster/pulumi/infra/src/gcpAlerts.ts index f99be6b845..9711776603 100644 --- a/cluster/pulumi/infra/src/gcpAlerts.ts +++ b/cluster/pulumi/infra/src/gcpAlerts.ts @@ -124,6 +124,12 @@ ${conditionalString( AND jsonPayload.remote=~"sender = PAR::tw-cn-testnet-participant-1::122051b3a160" AND timestamp <= "2025-05-14T09:00:00.000Z") ` +)} +${conditionalString( // making this condition more complicated causes GCP to be unable to parse the query because there's too many filters + isDevNet, + `-- TODO(hyperledger-labs/splice#447): remove this once configured cardinality is respected + -(jsonPayload.message="Instrument splice.trigger.latency.duration.seconds has exceeded the maximum allowed cardinality (1999).") +` )}`, labelExtractors: { cluster: 'EXTRACT(resource.labels.cluster_name)', From e86f1674ae3aa3db7e2c358649286857a408b929 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oriol=20Mu=C3=B1oz?= Date: Thu, 19 Jun 2025 11:29:11 +0000 Subject: [PATCH 2/4] Update expected MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ci] Signed-off-by: Oriol Muñoz --- .idea/runConfigurations/Simple_topology.xml | 4 ++-- cluster/expected/infra/expected.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.idea/runConfigurations/Simple_topology.xml b/.idea/runConfigurations/Simple_topology.xml index d754f3e2fe..c521cda67b 100644 --- a/.idea/runConfigurations/Simple_topology.xml +++ b/.idea/runConfigurations/Simple_topology.xml @@ -2,7 +2,7 @@ - + \ No newline at end of file diff --git a/cluster/expected/infra/expected.json b/cluster/expected/infra/expected.json index c0506c80a9..f7a2a767a1 100644 --- a/cluster/expected/infra/expected.json +++ b/cluster/expected/infra/expected.json @@ -1983,7 +1983,7 @@ "id": "", "inputs": { "description": "Logs with a severity level of warning or above", - "filter": "severity>=WARNING\nresource.type=\"k8s_container\"\nresource.labels.cluster_name=\"cn-mocknet\"\n-- Note that we ignore the validator runbook. This is because we reset it periodically, which sometimes produces noise.\nresource.labels.namespace_name=~\"sv|validator1|multi-validator|splitwell\"\n-(resource.labels.container_name=~\"participant\" AND jsonPayload.message=~\"Instrument .* has recorded multiple values for the same attributes.\")\n-- https://github.com/DACH-NY/canton-network-node/issues/10475\n-(resource.labels.container_name=\"cometbft\" AND\n ( jsonPayload.err=~\"\\Aerror adding vote\\z|\\Aalready stopped\\z|use of closed network connection\"\n OR jsonPayload._msg=~\"\\A(Stopping peer for error|Stopped accept routine, as transport is closed|Failed to write PacketMsg|Connection failed @ sendRoutine)\\z\"\n OR jsonPayload.error=\"already stopped\"\n OR textPayload=\"cp: not replacing '/cometbft/data/priv_validator_state.json'\"\n OR (jsonPayload._msg=\"Error stopping connection\" AND jsonPayload.err=\"already stopped\")\n OR jsonPayload._msg=\"Error adding peer to new bucket\"))\n-- execution context overload\n-jsonPayload.message=~\"Task runner canton-env-ec is .* overloaded\"\n-- on startup\n-textPayload=~\"Picked up JAVA_TOOL_OPTIONS:\"\n-- \\A and \\z anchor a search (=~) at beginning/end of string, respectively\n-- regex is significantly faster than OR; gcp docs themselves recommend\n-- regex-based factoring\n-resource.labels.container_name=~\"\\A(ans|wallet|scan|sv|splitwell)-web-ui\\z\"\n-- sequencer down\n-(resource.labels.namespace_name=~\"validator|splitwell\"\n AND resource.labels.container_name=~\"participant\"\n AND jsonPayload.message=~\"SEQUENCER_SUBSCRIPTION_LOST|Request failed for sequencer|Sequencer shutting down|Submission timed out|Response message for request .* timed out |periodic acknowledgement failed|Token refresh failed with Status{code=UNAVAILABLE\")\n-(resource.labels.container_name=\"postgres-exporter\" AND jsonPayload.msg=~\"Error loading config|Excluded databases\")\n-jsonPayload.message=~\"UnknownHostException\"\n-(resource.labels.container_name=~\"participant|mediator\" AND jsonPayload.message=~\"Late processing \\(or clock skew\\) of batch\")\n-(resource.labels.container_name=\"sequencer\" AND jsonPayload.stack_trace=~\"UnresolvedAddressException\")\n-(resource.labels.container_name=\"sequencer-pg\" AND\n (\"checkpoints are occurring too frequently\" OR \"Consider increasing the configuration parameter \\\"max_wal_size\\\".\"))\n-(resource.labels.container_name=~\"participant\" AND\n jsonPayload.message=~\"SYNC_SERVICE_ALARM.*Received a request.*where the view.*has (missing|extra) recipients|LOCAL_VERDICT_MALFORMED_PAYLOAD.*Rejected transaction due to malformed payload within views.*WrongRecipients|channel.*shutdown did not complete gracefully in allotted|LOCAL_VERDICT_FAILED_MODEL_CONFORMANCE_CHECK.*: UnvettedPackages\")\n-(resource.labels.container_name=\"mediator\" AND\n jsonPayload.message=~\"MEDIATOR_RECEIVED_MALFORMED_MESSAGE.*(Reason: (Missing root hash message for informee participants|Superfluous root hash message)|Received a (mediator|confirmation) response.*with an invalid root hash)\")\n-(jsonPayload.logger_name=~\"c.d.n.a.AdminAuthExtractor:\" AND jsonPayload.message=~\"Authorization Failed\")\n-(jsonPayload.level=\"error\" AND jsonPayload.msg=~\"/readyz\")\n-- The prometheus export server does not wait for any ongoing requests when shutting down https://github.com/prometheus/client_java/issues/938\n-jsonPayload.message=\"The Prometheus metrics HTTPServer caught an Exception while trying to send the metrics response.\"\n-- istio-proxy is spammy with warnings\n-(resource.labels.container_name=\"istio-proxy\" AND severity=WARNING\nresource.type=\"k8s_container\"\nresource.labels.cluster_name=\"cn-mocknet\"\n-- Note that we ignore the validator runbook. This is because we reset it periodically, which sometimes produces noise.\nresource.labels.namespace_name=~\"sv|validator1|multi-validator|splitwell\"\n-(resource.labels.container_name=~\"participant\" AND jsonPayload.message=~\"Instrument .* has recorded multiple values for the same attributes.\")\n-- https://github.com/DACH-NY/canton-network-node/issues/10475\n-(resource.labels.container_name=\"cometbft\" AND\n ( jsonPayload.err=~\"\\Aerror adding vote\\z|\\Aalready stopped\\z|use of closed network connection\"\n OR jsonPayload._msg=~\"\\A(Stopping peer for error|Stopped accept routine, as transport is closed|Failed to write PacketMsg|Connection failed @ sendRoutine)\\z\"\n OR jsonPayload.error=\"already stopped\"\n OR textPayload=\"cp: not replacing '/cometbft/data/priv_validator_state.json'\"\n OR (jsonPayload._msg=\"Error stopping connection\" AND jsonPayload.err=\"already stopped\")\n OR jsonPayload._msg=\"Error adding peer to new bucket\"))\n-- execution context overload\n-jsonPayload.message=~\"Task runner canton-env-ec is .* overloaded\"\n-- on startup\n-textPayload=~\"Picked up JAVA_TOOL_OPTIONS:\"\n-- \\A and \\z anchor a search (=~) at beginning/end of string, respectively\n-- regex is significantly faster than OR; gcp docs themselves recommend\n-- regex-based factoring\n-resource.labels.container_name=~\"\\A(ans|wallet|scan|sv|splitwell)-web-ui\\z\"\n-- sequencer down\n-(resource.labels.namespace_name=~\"validator|splitwell\"\n AND resource.labels.container_name=~\"participant\"\n AND jsonPayload.message=~\"SEQUENCER_SUBSCRIPTION_LOST|Request failed for sequencer|Sequencer shutting down|Submission timed out|Response message for request .* timed out |periodic acknowledgement failed|Token refresh failed with Status{code=UNAVAILABLE\")\n-(resource.labels.container_name=\"postgres-exporter\" AND jsonPayload.msg=~\"Error loading config|Excluded databases\")\n-jsonPayload.message=~\"UnknownHostException\"\n-(resource.labels.container_name=~\"participant|mediator\" AND jsonPayload.message=~\"Late processing \\(or clock skew\\) of batch\")\n-(resource.labels.container_name=\"sequencer\" AND jsonPayload.stack_trace=~\"UnresolvedAddressException\")\n-(resource.labels.container_name=\"sequencer-pg\" AND\n (\"checkpoints are occurring too frequently\" OR \"Consider increasing the configuration parameter \\\"max_wal_size\\\".\"))\n-(resource.labels.container_name=~\"participant\" AND\n jsonPayload.message=~\"SYNC_SERVICE_ALARM.*Received a request.*where the view.*has (missing|extra) recipients|LOCAL_VERDICT_MALFORMED_PAYLOAD.*Rejected transaction due to malformed payload within views.*WrongRecipients|channel.*shutdown did not complete gracefully in allotted|LOCAL_VERDICT_FAILED_MODEL_CONFORMANCE_CHECK.*: UnvettedPackages\")\n-(resource.labels.container_name=\"mediator\" AND\n jsonPayload.message=~\"MEDIATOR_RECEIVED_MALFORMED_MESSAGE.*(Reason: (Missing root hash message for informee participants|Superfluous root hash message)|Received a (mediator|confirmation) response.*with an invalid root hash)\")\n-(jsonPayload.logger_name=~\"c.d.n.a.AdminAuthExtractor:\" AND jsonPayload.message=~\"Authorization Failed\")\n-(jsonPayload.level=\"error\" AND jsonPayload.msg=~\"/readyz\")\n-- The prometheus export server does not wait for any ongoing requests when shutting down https://github.com/prometheus/client_java/issues/938\n-jsonPayload.message=\"The Prometheus metrics HTTPServer caught an Exception while trying to send the metrics response.\"\n-- istio-proxy is spammy with warnings\n-(resource.labels.container_name=\"istio-proxy\" AND severity Date: Thu, 19 Jun 2025 11:30:58 +0000 Subject: [PATCH 3/4] ???? [ci] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Oriol Muñoz --- .idea/runConfigurations/Simple_topology.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.idea/runConfigurations/Simple_topology.xml b/.idea/runConfigurations/Simple_topology.xml index c521cda67b..d754f3e2fe 100644 --- a/.idea/runConfigurations/Simple_topology.xml +++ b/.idea/runConfigurations/Simple_topology.xml @@ -2,7 +2,7 @@ - \ No newline at end of file + From b8dac8f026cfaab204d739eb1031fd9e98daa27c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oriol=20Mu=C3=B1oz?= Date: Thu, 19 Jun 2025 11:58:28 +0000 Subject: [PATCH 4/4] [static] whatever makes you happy, prettier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Oriol Muñoz --- cluster/pulumi/infra/src/gcpAlerts.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cluster/pulumi/infra/src/gcpAlerts.ts b/cluster/pulumi/infra/src/gcpAlerts.ts index 9711776603..150fd6a871 100644 --- a/cluster/pulumi/infra/src/gcpAlerts.ts +++ b/cluster/pulumi/infra/src/gcpAlerts.ts @@ -125,7 +125,8 @@ ${conditionalString( AND timestamp <= "2025-05-14T09:00:00.000Z") ` )} -${conditionalString( // making this condition more complicated causes GCP to be unable to parse the query because there's too many filters +${conditionalString( + // making this condition more complicated causes GCP to be unable to parse the query because there's too many filters isDevNet, `-- TODO(hyperledger-labs/splice#447): remove this once configured cardinality is respected -(jsonPayload.message="Instrument splice.trigger.latency.duration.seconds has exceeded the maximum allowed cardinality (1999).")