Skip to content

Commit 05e647b

Browse files
Added cluster reaper
[ci] Signed-off-by: Pasindu Tennage <pasindu.tennage@digitalasset.com>
1 parent e670ba6 commit 05e647b

File tree

8 files changed

+206
-9
lines changed

8 files changed

+206
-9
lines changed

cluster/images/splice-debug/Dockerfile

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,15 @@
66
FROM --platform=linux/amd64 ubuntu:24.04@sha256:440dcf6a5640b2ae5c77724e68787a906afb8ddee98bf86db94eea8528c2c076
77
LABEL org.opencontainers.image.base.name="ubuntu:24.04"
88

9-
RUN apt-get update && apt-get install -y postgresql-client curl netcat-openbsd
9+
ARG KUBECTL_VERSION
10+
11+
RUN apt-get update && apt-get install -y \
12+
postgresql-client \
13+
curl \
14+
netcat-openbsd \
15+
jq
16+
17+
# Install kubectl at the version given by the KUBECTL_VERSION build argument.
# Fail fast with a clear message when the argument is missing, and remove the
# downloaded binary after installing it so it is not kept twice in the layer.
RUN : "${KUBECTL_VERSION:?KUBECTL_VERSION build argument is required}" && curl -fsSLO "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl" && install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl && rm kubectl
1018
RUN curl -sSLO https://github.com/fullstorydev/grpcurl/releases/download/v1.9.2/grpcurl_1.9.2_linux_amd64.deb && dpkg -i grpcurl_1.9.2_linux_amd64.deb && rm grpcurl_1.9.2_linux_amd64.deb
1119

1220
COPY target/LICENSE .

cluster/images/splice-debug/local.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,6 @@
44
dir := $(call current_dir)
55

66
$(dir)/$(docker-build): $(dir)/target/LICENSE
7-
7+
$(dir)/$(docker-build): build_arg := --build-arg KUBECTL_VERSION=v${KUBECTL_VERSION}
88
$(dir)/target/LICENSE: ${SPLICE_ROOT}/cluster/images/LICENSE
99
cp $< $@
Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,7 @@
22
// SPDX-License-Identifier: Apache-2.0
33
import { activeVersion } from '@lfdecentralizedtrust/splice-pulumi-common';
44

5-
export const Version = versionFromDefault();
6-
7-
function versionFromDefault() {
5+
export function versionFromDefault(): string {
86
if (activeVersion.type == 'remote') {
97
return activeVersion.version;
108
} else {

cluster/pulumi/infra/src/config.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ export const InfraConfigSchema = z.object({
9393
extraWhitelistedIngress: z.array(z.string()).default([]),
9494
})
9595
.optional(),
96+
enableGCReaperJob: z.boolean().default(false),
9697
prometheus: z.object({
9798
storageSize: z.string(),
9899
retentionDuration: z.string(),
@@ -117,7 +118,7 @@ export type Config = z.infer<typeof InfraConfigSchema>;
117118
// eslint-disable-next-line
118119
// @ts-ignore
119120
const fullConfig = InfraConfigSchema.parse(clusterYamlConfig);
120-
121+
export const enableGCReaperJob = fullConfig.infra.enableGCReaperJob;
121122
console.error(
122123
`Loaded infra config: ${util.inspect(fullConfig, {
123124
depth: null,

cluster/pulumi/infra/src/index.ts

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,13 @@ import { config } from '@lfdecentralizedtrust/splice-pulumi-common';
66
import { clusterIsResetPeriodically, enableAlerts } from './alertings';
77
import { configureAuth0 } from './auth0';
88
import { configureCloudArmorPolicy } from './cloudArmor';
9-
import { cloudArmorConfig, clusterBaseDomain, clusterBasename, monitoringConfig } from './config';
9+
import {
10+
cloudArmorConfig,
11+
clusterBaseDomain,
12+
clusterBasename,
13+
enableGCReaperJob,
14+
monitoringConfig,
15+
} from './config';
1016
import { installExtraCustomResources } from './extraCustomResources';
1117
import {
1218
getNotificationChannel,
@@ -15,6 +21,7 @@ import {
1521
installClusterMaintenanceUpdateAlerts,
1622
} from './gcpAlerts';
1723
import { configureIstio, istioMonitoring } from './istio';
24+
import { deployGCPodReaper } from './maintenance';
1825
import { configureNetwork } from './network';
1926
import { configureObservability } from './observability';
2027
import { configureStorage } from './storage';
@@ -48,6 +55,10 @@ configureCloudArmorPolicy(cloudArmorConfig);
4855

4956
installExtraCustomResources();
5057

58+
if (enableGCReaperJob) {
59+
deployGCPodReaper('cluster-pod-gc-reaper', ['multi-validator'], { parent: network.ingressNs.ns });
60+
}
61+
5162
let configuredAuth0;
5263
if (config.envFlag('CLUSTER_CONFIGURE_AUTH0', true)) {
5364
configuredAuth0 = configureAuth0(clusterBasename, network.dnsNames);
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
// Copyright (c) 2024 Digital Asset (Switzerland) GmbH and/or its affiliates. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
import * as k8s from '@pulumi/kubernetes';
4+
import * as pulumi from '@pulumi/pulumi';
5+
import { versionFromDefault } from '@lfdecentralizedtrust/splice-pulumi-common/src/version';
6+
7+
import { DOCKER_REPO, infraAffinityAndTolerations } from '../../common';
8+
9+
// Cron expression for the reaper: minute 0 of every hour.
const schedule = '0 * * * *';

// Names of the Kubernetes objects that make up the pod reaper.
const reaperNamespace = 'gc-pod-reaper';
const serviceAccountName = 'gc-pod-reaper-service-account';
const cronJobName = 'gc-pod-reaper-job';
13+
14+
/**
 * Container command for the pod reaper: `/bin/bash -c <script>`.
 *
 * The script reads a comma-separated namespace list from the TARGET_NAMESPACES
 * environment variable and, for each namespace, deletes every pod that either
 *  - has phase "Unknown" with reason "ContainerStatusUnknown", or
 *  - has a container terminated with reason "Error" and exit code 137.
 *
 * It exits 1 when no namespaces are given. Otherwise it always ends with
 * `true`, so individual delete failures are logged but do not fail the job.
 */
const deleteBadPodsCommand = [
  '/bin/bash',
  '-c',
  `
echo "--- $(date) Starting Pod Reaper ---";
TARGET_NAMESPACES_LIST=$(echo "$TARGET_NAMESPACES" | tr ',' ' ');
echo "Targeting namespaces: $TARGET_NAMESPACES_LIST";
if [ -z "$TARGET_NAMESPACES_LIST" ]; then
  echo "Error: No target namespaces provided. Exiting.";
  exit 1
fi
echo "--- Starting Cleanup Loop ---";
for NAMESPACE in $TARGET_NAMESPACES_LIST; do
  echo "Processing namespace: $NAMESPACE";
  echo "Listing all pods in $NAMESPACE:";
  kubectl get pods -n "$NAMESPACE";
  echo "--- checking PODS with bad status in $NAMESPACE ---";
  # The jq filter prints a pod name once per matching container, so a pod with
  # several killed containers would be listed repeatedly; sort -u de-duplicates.
  BAD_PODS=$(
    kubectl get pods -n "$NAMESPACE" -o json | \\
    jq -r '.items[] |
      (select(.status.phase == "Unknown" and .status.reason == "ContainerStatusUnknown") |
        .metadata.name) //
      (select(.status.containerStatuses[]?.state.terminated? |
        .reason == "Error" and .exitCode == 137) |
        .metadata.name)' | sort -u
  );
  if [ -z "$BAD_PODS" ]; then
    echo "No bad pods found in $NAMESPACE. Skipping.";
    continue
  fi
  echo "Found bad pods in $NAMESPACE: $BAD_PODS";
  for POD_NAME in $BAD_PODS; do
    echo "Attempting to delete pod $NAMESPACE/$POD_NAME";
    if kubectl delete pod -n "$NAMESPACE" "$POD_NAME"; then
      echo "Successfully deleted $NAMESPACE/$POD_NAME";
    else
      echo "Failed to delete $NAMESPACE/$POD_NAME";
    fi
  done
  echo "Finished cleanup for $NAMESPACE.";
  echo "---"
done
echo "--- $(date) Pod Reaper Finished ---";
true
`,
];
61+
62+
/**
 * Deploys the pod reaper and everything it needs to run:
 *  - a Namespace named `reaperNamespace`,
 *  - in every target namespace, a Role on pods plus a RoleBinding that grants
 *    it to the reaper's ServiceAccount,
 *  - the ServiceAccount itself in the reaper namespace,
 *  - a CronJob that runs `deleteBadPodsCommand` on `schedule`, receiving the
 *    target namespaces as the comma-separated TARGET_NAMESPACES env variable.
 *
 * @param name - Pulumi logical name of the reaper Namespace resource.
 * @param targetNamespaces - Namespaces whose pods the reaper may list and delete.
 * @param opts - Extra resource options for the CronJob. They are spread after
 *   `{ parent: ns }`, so a `parent` supplied here replaces the reaper namespace
 *   as the CronJob's parent.
 * @returns The CronJob resource.
 */
export function deployGCPodReaper(
  name: string,
  targetNamespaces: string[],
  opts?: pulumi.ComponentResourceOptions
): k8s.batch.v1.CronJob {
  const ns = new k8s.core.v1.Namespace(name, {
    metadata: {
      name: reaperNamespace,
      labels: {
        'app.kubernetes.io/name': reaperNamespace,
      },
    },
  });

  targetNamespaces.forEach(namespace => {
    const roleName = `${namespace}-gc-pod-reaper-role`;
    // Same hyphen-separated name for the Pulumi resource and the Kubernetes object.
    const roleBindingName = `${namespace}-gc-pod-reaper-pod-manager-binding`;

    const podManagementRole = new k8s.rbac.v1.Role(
      roleName,
      {
        metadata: {
          name: roleName,
          namespace: namespace,
        },
        rules: [
          {
            apiGroups: [''], // Core API group for Pods
            resources: ['pods'],
            // NOTE(review): the reaper script only runs `kubectl get pods` and
            // `kubectl delete pod`; 'create' and 'update' look unnecessary —
            // confirm before trimming.
            verbs: ['list', 'create', 'delete', 'update'],
          },
        ],
      },
      { parent: ns }
    );

    new k8s.rbac.v1.RoleBinding(
      roleBindingName,
      {
        metadata: {
          name: roleBindingName,
          namespace: namespace,
        },
        subjects: [
          {
            kind: 'ServiceAccount',
            name: serviceAccountName,
            // The ServiceAccount lives in the reaper namespace, not the target one.
            namespace: reaperNamespace,
          },
        ],
        roleRef: {
          kind: 'Role',
          name: podManagementRole.metadata.name,
          apiGroup: 'rbac.authorization.k8s.io',
        },
      },
      { parent: podManagementRole, dependsOn: [podManagementRole] }
    );
  });

  // The reaper script splits this value on ',' to get the namespace list.
  const targetNamespacesEnv = targetNamespaces.join(',');

  new k8s.core.v1.ServiceAccount(
    serviceAccountName,
    {
      metadata: {
        name: serviceAccountName,
        namespace: ns.metadata.name,
      },
    },
    { parent: ns }
  );

  return new k8s.batch.v1.CronJob(
    cronJobName,
    {
      metadata: {
        name: cronJobName,
        namespace: ns.metadata.name,
      },
      spec: {
        schedule: schedule,
        // Never start a new run while the previous one is still going.
        concurrencyPolicy: 'Forbid',
        successfulJobsHistoryLimit: 2,
        failedJobsHistoryLimit: 2,
        jobTemplate: {
          metadata: {
            labels: {
              app: 'gc-pod-reaper',
            },
          },
          spec: {
            template: {
              spec: {
                serviceAccountName: serviceAccountName,
                restartPolicy: 'OnFailure',
                ...infraAffinityAndTolerations,
                containers: [
                  {
                    name: cronJobName,
                    image: `${DOCKER_REPO}/splice-debug:${versionFromDefault()}`,
                    imagePullPolicy: 'Always',
                    command: deleteBadPodsCommand,
                    env: [
                      {
                        name: 'TARGET_NAMESPACES',
                        value: targetNamespacesEnv,
                      },
                    ],
                  },
                ],
              },
            },
          },
        },
      },
    },
    { parent: ns, ...opts }
  );
}

cluster/pulumi/multi-validator/src/multiNodeDeployment.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ import {
1010
numNodesPerInstance,
1111
} from '@lfdecentralizedtrust/splice-pulumi-common';
1212
import { ServiceMonitor } from '@lfdecentralizedtrust/splice-pulumi-common/src/metrics';
13+
import { versionFromDefault } from '@lfdecentralizedtrust/splice-pulumi-common/src/version';
1314
import _ from 'lodash';
1415

15-
import { Version } from '../version';
1616
import { EnvironmentVariable, multiValidatorConfig } from './config';
1717

1818
export interface BaseMultiNodeArgs {
@@ -89,7 +89,7 @@ export class MultiNodeDeployment extends pulumi.ComponentResource {
8989
containers: [
9090
{
9191
name: args.imageName,
92-
image: `${DOCKER_REPO}/${args.imageName}:${Version}`,
92+
image: `${DOCKER_REPO}/${args.imageName}:${versionFromDefault()}`,
9393
...imagePullPolicy,
9494
...args.container,
9595
ports: args.container.ports.concat([

nix/shell.nix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,4 +133,5 @@ in pkgs.mkShell {
133133

134134
PULUMI_VERSION="${pkgs.pulumi-bin.version}";
135135
GECKODRIVER="${pkgs.geckodriver}/bin/geckodriver";
136+
KUBECTL_VERSION="${pkgs.kubectl.version}";
136137
}

0 commit comments

Comments
 (0)