Skip to content

Commit df8da1b

Browse files
Cluster Reaper job for deleting bad pods (#3414)
[ci] Signed-off-by: Pasindu Tennage <[email protected]>
1 parent a1ffd04 commit df8da1b

File tree

8 files changed

+216
-5
lines changed

8 files changed

+216
-5
lines changed

cluster/images/splice-debug/Dockerfile

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,16 @@
55
# so we're hard-coding amd64, and accepting that this will break on arm64.
66
FROM --platform=linux/amd64 ubuntu:24.04@sha256:440dcf6a5640b2ae5c77724e68787a906afb8ddee98bf86db94eea8528c2c076
77
LABEL org.opencontainers.image.base.name="ubuntu:24.04"
8+
ARG KUBECTL_VERSION
9+
10+
RUN apt-get update && apt-get install -y \
11+
postgresql-client \
12+
curl \
13+
netcat-openbsd \
14+
jq
15+
16+
RUN curl -fL -LO https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl && install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
817

9-
RUN apt-get update && apt-get install -y postgresql-client curl netcat-openbsd
1018
RUN curl -sSLO https://github.com/fullstorydev/grpcurl/releases/download/v1.9.2/grpcurl_1.9.2_linux_amd64.deb && dpkg -i grpcurl_1.9.2_linux_amd64.deb && rm grpcurl_1.9.2_linux_amd64.deb
1119

1220
COPY target/LICENSE .

cluster/images/splice-debug/local.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,6 @@
44
dir := $(call current_dir)
55

66
$(dir)/$(docker-build): $(dir)/target/LICENSE
7-
7+
$(dir)/$(docker-build): build_arg := --build-arg KUBECTL_VERSION=v${KUBECTL_VERSION}
88
$(dir)/target/LICENSE: ${SPLICE_ROOT}/cluster/images/LICENSE
99
cp $< $@

cluster/pulumi/infra/src/config.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ export const InfraConfigSchema = z.object({
9393
extraWhitelistedIngress: z.array(z.string()).default([]),
9494
})
9595
.optional(),
96+
enableGCReaperJob: z.boolean().default(false),
9697
prometheus: z.object({
9798
storageSize: z.string(),
9899
retentionDuration: z.string(),
@@ -117,7 +118,7 @@ export type Config = z.infer<typeof InfraConfigSchema>;
117118
// eslint-disable-next-line
118119
// @ts-ignore
119120
const fullConfig = InfraConfigSchema.parse(clusterYamlConfig);
120-
121+
export const enableGCReaperJob = fullConfig.infra.enableGCReaperJob;
121122
console.error(
122123
`Loaded infra config: ${util.inspect(fullConfig, {
123124
depth: null,

cluster/pulumi/infra/src/index.ts

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,13 @@ import { config } from '@lfdecentralizedtrust/splice-pulumi-common';
66
import { clusterIsResetPeriodically, enableAlerts } from './alertings';
77
import { configureAuth0 } from './auth0';
88
import { configureCloudArmorPolicy } from './cloudArmor';
9-
import { cloudArmorConfig, clusterBaseDomain, clusterBasename, monitoringConfig } from './config';
9+
import {
10+
cloudArmorConfig,
11+
clusterBaseDomain,
12+
clusterBasename,
13+
enableGCReaperJob,
14+
monitoringConfig,
15+
} from './config';
1016
import { installExtraCustomResources } from './extraCustomResources';
1117
import {
1218
getNotificationChannel,
@@ -15,6 +21,7 @@ import {
1521
installClusterMaintenanceUpdateAlerts,
1622
} from './gcpAlerts';
1723
import { configureIstio, istioMonitoring } from './istio';
24+
import { deployGCPodReaper } from './maintenance';
1825
import { configureNetwork } from './network';
1926
import { configureObservability } from './observability';
2027
import { configureStorage } from './storage';
@@ -48,6 +55,10 @@ configureCloudArmorPolicy(cloudArmorConfig);
4855

4956
installExtraCustomResources();
5057

58+
if (enableGCReaperJob) {
59+
deployGCPodReaper('cluster-pod-gc-reaper', ['multi-validator'], { parent: network.ingressNs.ns });
60+
}
61+
5162
let configuredAuth0;
5263
if (config.envFlag('CLUSTER_CONFIGURE_AUTH0', true)) {
5364
configuredAuth0 = configureAuth0(clusterBasename, network.dnsNames);
cluster/pulumi/infra/src/maintenance.ts

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
// Copyright (c) 2024 Digital Asset (Switzerland) GmbH and/or its affiliates. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
import * as k8s from '@pulumi/kubernetes';
4+
import * as pulumi from '@pulumi/pulumi';
5+
import { Version } from '@lfdecentralizedtrust/splice-pulumi-common/src/version';
6+
7+
import { DOCKER_REPO, infraAffinityAndTolerations } from '../../common';
8+
9+
// Name used for the CronJob Pulumi resource, its Kubernetes metadata name, and its container.
const cronJobName = 'gc-pod-reaper-job';
10+
// Kubernetes namespace (and its app.kubernetes.io/name label) that hosts the reaper.
const reaperNamespace = 'gc-pod-reaper';
11+
// Service account the reaper pods run as; per-namespace RoleBindings grant it pod access.
const serviceAccountName = 'gc-pod-reaper-service-account';
12+
const schedule = '0 * * * *'; // Run once every hour, at minute 0
13+
14+
/**
 * Container command for the pod reaper: a bash script that walks every namespace listed
 * in the comma-separated TARGET_NAMESPACES environment variable and deletes pods that are
 * either
 *   - in phase "Unknown" with reason "ContainerStatusUnknown", or
 *   - have at least one container terminated with reason "Error" and exit code 137.
 *
 * The script exits 1 when no target namespace is configured. Otherwise it is best-effort:
 * listing or deletion failures are logged and the script still ends with `true`, so a
 * single failure does not fail the whole run.
 *
 * The script relies on `kubectl` and `jq` being available in the container image.
 */
const deleteBadPodsCommand = [
  '/bin/bash',
  '-c',
  `
    echo "--- $(date) Starting Pod Reaper ---";

    TARGET_NAMESPACES_LIST=$(echo "$TARGET_NAMESPACES" | tr ',' ' ');
    echo "Targeting namespaces: $TARGET_NAMESPACES_LIST";

    if [ -z "$TARGET_NAMESPACES_LIST" ]; then
      echo "Error: No target namespaces provided. Exiting.";
      exit 1
    fi

    echo "--- Starting Cleanup Loop ---";

    for NAMESPACE in $TARGET_NAMESPACES_LIST; do
      echo "Processing namespace: $NAMESPACE";

      echo "Listing all pods in $NAMESPACE:";
      kubectl get pods -n "$NAMESPACE";

      echo "--- checking PODS with bad status in $NAMESPACE ---";

      # Fetch the pod list separately so that a failing kubectl call is reported as an
      # error instead of being mistaken for "no bad pods".
      if ! PODS_JSON=$(kubectl get pods -n "$NAMESPACE" -o json); then
        echo "Error: failed to list pods in $NAMESPACE. Skipping.";
        continue
      fi

      # any(...) yields a single boolean per pod, so a pod with several failed containers
      # is emitted (and later deleted) only once.
      BAD_PODS=$(
        jq -r '.items[] |
          select(
            (.status.phase == "Unknown" and .status.reason == "ContainerStatusUnknown") or
            any(.status.containerStatuses[]?;
                .state.terminated? | .reason == "Error" and .exitCode == 137)
          ) |
          .metadata.name' <<< "$PODS_JSON"
      );

      if [ -z "$BAD_PODS" ]; then
        echo "No bad pods found in $NAMESPACE. Skipping.";
        continue
      fi

      echo "Found bad pods in $NAMESPACE: $BAD_PODS";

      for POD_NAME in $BAD_PODS; do
        echo "Attempting to delete pod $NAMESPACE/$POD_NAME";
        if kubectl delete pod -n "$NAMESPACE" "$POD_NAME"; then
          echo "Successfully deleted $NAMESPACE/$POD_NAME";
        else
          echo "Failed to delete $NAMESPACE/$POD_NAME";
        fi
      done
      echo "Finished cleanup for $NAMESPACE.";
      echo "---"
    done

    echo "--- $(date) Pod Reaper Finished ---";
    true
  `,
];
73+
74+
/**
 * Grants the reaper's service account pod-management rights in one target namespace by
 * creating a Role there and binding it to the service account.
 *
 * @param namespace - Target namespace whose pods the reaper may list and delete.
 * @param reaperNs - Namespace resource that hosts the reaper; used as the Pulumi parent
 *   of the Role and as the namespace of the binding subject.
 * @param serviceAccount - Service account the reaper pods run as.
 */
function grantPodManagement(
  namespace: string,
  reaperNs: k8s.core.v1.Namespace,
  serviceAccount: k8s.core.v1.ServiceAccount
): void {
  const roleName = `${namespace}-gc-pod-reaper-role`;
  const podManagementRole = new k8s.rbac.v1.Role(
    roleName,
    {
      metadata: {
        name: roleName,
        namespace,
      },
      rules: [
        {
          apiGroups: [''], // Core API group for Pods
          resources: ['pods'],
          // NOTE(review): the reaper script only runs `kubectl get pods` and
          // `kubectl delete pod`; 'create' and 'update' look unnecessary — confirm
          // before tightening.
          verbs: ['list', 'create', 'delete', 'update'],
        },
      ],
    },
    { parent: reaperNs }
  );

  // The Kubernetes name now matches the Pulumi resource name; previously the hyphen
  // after the namespace was missing from metadata.name.
  const bindingName = `${namespace}-gc-pod-reaper-pod-manager-binding`;
  new k8s.rbac.v1.RoleBinding(
    bindingName,
    {
      metadata: {
        name: bindingName,
        namespace,
      },
      subjects: [
        {
          kind: 'ServiceAccount',
          // Reference the created resources instead of repeating string literals so the
          // binding can never drift from the actual service account / namespace.
          name: serviceAccount.metadata.name,
          namespace: reaperNs.metadata.name,
        },
      ],
      roleRef: {
        kind: 'Role',
        name: podManagementRole.metadata.name,
        apiGroup: 'rbac.authorization.k8s.io',
      },
    },
    { parent: podManagementRole, dependsOn: [podManagementRole] }
  );
}

/**
 * Deploys the hourly "pod reaper" CronJob together with everything it needs: its own
 * namespace, a service account, and per-target-namespace Role/RoleBinding pairs that
 * allow that service account to manage pods.
 *
 * @param name - Pulumi resource name of the reaper namespace (the Kubernetes namespace
 *   name itself is fixed by `reaperNamespace`).
 * @param targetNamespaces - Namespaces whose bad pods should be deleted; passed to the
 *   job as the comma-separated TARGET_NAMESPACES environment variable.
 * @param opts - Extra resource options for the CronJob only. They are spread after the
 *   default `{ parent: ns }`, so a `parent` given here replaces the namespace as parent.
 * @returns The created CronJob resource.
 */
export function deployGCPodReaper(
  name: string,
  targetNamespaces: string[],
  opts?: pulumi.ComponentResourceOptions
): k8s.batch.v1.CronJob {
  const ns = new k8s.core.v1.Namespace(name, {
    metadata: {
      name: reaperNamespace,
      labels: {
        'app.kubernetes.io/name': reaperNamespace,
      },
    },
  });

  const serviceAccount = new k8s.core.v1.ServiceAccount(
    serviceAccountName,
    {
      metadata: {
        name: serviceAccountName,
        namespace: ns.metadata.name,
      },
    },
    { parent: ns }
  );

  for (const namespace of targetNamespaces) {
    grantPodManagement(namespace, ns, serviceAccount);
  }

  const targetNamespacesEnv = targetNamespaces.join(',');

  return new k8s.batch.v1.CronJob(
    cronJobName,
    {
      metadata: {
        name: cronJobName,
        namespace: ns.metadata.name,
      },
      spec: {
        schedule: schedule,
        concurrencyPolicy: 'Forbid',
        successfulJobsHistoryLimit: 2,
        failedJobsHistoryLimit: 2,
        jobTemplate: {
          metadata: {
            labels: {
              app: 'gc-pod-reaper',
            },
          },
          spec: {
            template: {
              spec: {
                // Using the resource output (rather than the bare constant) makes the
                // CronJob depend on the service account it runs as.
                serviceAccountName: serviceAccount.metadata.name,
                restartPolicy: 'OnFailure',
                ...infraAffinityAndTolerations,
                containers: [
                  {
                    name: cronJobName,
                    image: `${DOCKER_REPO}/splice-debug:${Version}`,
                    imagePullPolicy: 'Always',
                    command: deleteBadPodsCommand,
                    env: [
                      {
                        name: 'TARGET_NAMESPACES',
                        value: targetNamespacesEnv,
                      },
                    ],
                  },
                ],
              },
            },
          },
        },
      },
    },
    { parent: ns, ...opts }
  );
}

cluster/pulumi/multi-validator/src/multiNodeDeployment.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ import {
1010
numNodesPerInstance,
1111
} from '@lfdecentralizedtrust/splice-pulumi-common';
1212
import { ServiceMonitor } from '@lfdecentralizedtrust/splice-pulumi-common/src/metrics';
13+
import { Version } from '@lfdecentralizedtrust/splice-pulumi-common/src/version';
1314
import _ from 'lodash';
1415

15-
import { Version } from '../version';
1616
import { EnvironmentVariable, multiValidatorConfig } from './config';
1717

1818
export interface BaseMultiNodeArgs {

nix/shell.nix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,4 +133,5 @@ in pkgs.mkShell {
133133

134134
PULUMI_VERSION="${pkgs.pulumi-bin.version}";
135135
GECKODRIVER="${pkgs.geckodriver}/bin/geckodriver";
136+
KUBECTL_VERSION="${pkgs.kubectl.version}";
136137
}

0 commit comments

Comments
 (0)