Skip to content

Commit 96416a5

Browse files
committed
2 parents c1a9e90 + f8d208d commit 96416a5

File tree

13 files changed

+640
-205
lines changed

13 files changed

+640
-205
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@ tf/**/alpo-*.yaml
66
tf/**/kubeconfig_*
77
tfplan
88
charts
9-
**/talos/*.yaml
9+
**/talos/clusterconfig/
1010
**/talos/*.iso
1111
**/talos/talosconfig
12+
!**/talos/talconfig.yaml
13+
**/talos/talsecret.yaml
1214
**/talos/aurora
1315
**/talos/.DS_Store
1416

clusters/talos-ottawa/.mise.toml

Lines changed: 138 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,4 +57,141 @@ description = "Reset cluster (DESTRUCTIVE)"
5757

5858
# Runs the cleanup-thunderbolt-debug task of the Taskfile in bootstrap/talos.
[tasks.cleanup-thunderbolt-debug]
description = "Remove Thunderbolt debug DaemonSet"
run = 'task -d bootstrap/talos cleanup-thunderbolt-debug'
61+
62+
# Additional Talos management tasks (converted from Makefile)
63+
64+
# Queries each of the three hard-coded nodes through its own API endpoint.
[tasks.disk]
description = "Show discovered volumes on all nodes"
run = '''
for node in rei asuka kaji; do
  echo "=== Discovered volumes on $node ==="
  talosctl -e $node -n $node get discoveredvolumes
done
'''
72+
73+
# Only the node "rei" is queried.
[tasks.etcd]
description = "Check etcd service status"
run = 'talosctl service etcd -n rei -e rei'
76+
77+
# Only the node "rei" is queried.
[tasks.static]
description = "List static pods"
run = 'talosctl get staticpods -n rei -e rei'
80+
81+
# Prints a banner per node, then that node's linkstatus resources.
[tasks.link]
description = "Show network link status on all nodes"
run = '''
for node in rei asuka kaji; do
  echo "=== Link status on $node ==="
  talosctl get linkstatus -n $node -e $node
done
'''
89+
90+
# Prints a banner per node, then that node's address resources.
[tasks.addresses]
description = "Show IP addresses on all nodes"
run = '''
for node in rei asuka kaji; do
  echo "=== Addresses on $node ==="
  talosctl get addresses -n $node -e $node
done
'''
98+
99+
# Prints a banner per node, then the output of `talosctl service` for it.
[tasks.service]
description = "List services on all nodes"
run = '''
for node in rei asuka kaji; do
  echo "=== Services on $node ==="
  talosctl service -n $node -e $node
done
'''
107+
108+
# Reboots rei, asuka and kaji one after another. There is a 5 second window
# to abort before the first reboot and a fixed 30 second pause after each node.
[tasks.reboot]
description = "Reboot all nodes (DISRUPTIVE)"
run = '''
echo "WARNING: This will reboot nodes! Press Ctrl+C to cancel..."
sleep 5
for node in rei asuka kaji; do
  echo "Rebooting $node..."
  talosctl -e $node -n $node reboot --debug
  echo "Waiting 30 seconds before next node..."
  sleep 30
done
'''
120+
121+
# Wipes the EPHEMERAL and STATE partitions of one node and reboots it.
#
# The script begins with a shebang so that mise runs it as a script file and
# passes the command-line arguments as positional parameters ($1). For plain
# inline commands mise appends the arguments to the command text instead, which
# would leave $1 empty.
# NOTE(review): confirm this argument handling against the mise version in use.
[tasks.reset-node]
description = "Reset a single node to maintenance mode (DESTRUCTIVE)"
run = '''
#!/bin/sh
set -eu

node="${1:-}"
# Validate before printing any countdown so a missing argument fails fast.
if [ -z "$node" ]; then
  echo "Error: Node name required" >&2
  echo "Usage: mise run reset-node <nodename>" >&2
  echo "Example: mise run reset-node rei" >&2
  exit 1
fi

echo "WARNING: This will reset a node!"
echo "Resetting node $node in 5 seconds... Press Ctrl+C to cancel"
sleep 5
talosctl -e "$node" -n "$node" reset --system-labels-to-wipe EPHEMERAL,STATE --graceful=false --reboot
'''
135+
136+
# Upgrades rei, asuka and kaji sequentially to the Talos version given as the
# first argument (default v1.10.6), pausing 60 seconds after each node.
#
# The script begins with a shebang so that mise runs it as a script file and
# passes the command-line arguments as positional parameters ($1).
# NOTE(review): confirm this argument handling against the mise version in use.
# NOTE(review): this pulls the stock ghcr.io/siderolabs/installer image. If the
# nodes were installed from a factory.talos.dev schematic (as the iso task in
# this file suggests), upgrading this way may drop system extensions - confirm.
[tasks.upgrade]
description = "Upgrade Talos version on all nodes"
run = '''
#!/bin/sh
set -eu

TALOS_VERSION="${1:-v1.10.6}"
echo "Upgrading cluster to Talos $TALOS_VERSION"
for node in rei asuka kaji; do
  echo "Upgrading $node..."
  talosctl upgrade --nodes "$node" --image "ghcr.io/siderolabs/installer:${TALOS_VERSION}" -e "$node" --wait --debug
  echo "Waiting for node to be ready..."
  sleep 60
done
'''
148+
149+
# NOTE(review): the --context value below looks like an e-mail-obfuscation
# placeholder rather than a real kubeconfig context name - confirm the intended
# context (see `kubectl config get-contexts`).
[tasks.k9s]
description = "Launch k9s with cluster context"
run = 'k9s --context [email protected]'
152+
153+
# Derives the image-factory schematic ID (via talhelper) and the Talos version
# (via grep) from bootstrap/talos/talconfig.yaml, then downloads the matching
# metal-amd64 ISO into the current directory.
[tasks.iso]
description = "Download custom ISO with extensions from talconfig"
run = '''
echo "Generating ISO from talconfig schematic..."
# head/-m1 keep a single value even when several lines match.
SCHEMATIC_ID=$(talhelper genurl installer --config-file bootstrap/talos/talconfig.yaml | grep -oE 'factory.talos.dev/installer/[^:]+' | cut -d'/' -f3 | head -n1)
TALOS_VERSION=$(grep -m1 talosVersion bootstrap/talos/talconfig.yaml | awk '{print $2}')
if [ -z "$SCHEMATIC_ID" ] || [ -z "$TALOS_VERSION" ]; then
  echo "Error: could not determine schematic ID or Talos version from bootstrap/talos/talconfig.yaml" >&2
  exit 1
fi
echo "Schematic ID: $SCHEMATIC_ID"
echo "Talos Version: $TALOS_VERSION"
echo "Downloading ISO..."
# --fail stops an HTTP error page from being saved as the ISO; --location follows redirects.
if ! curl --fail --location -O "https://factory.talos.dev/image/${SCHEMATIC_ID}/${TALOS_VERSION}/metal-amd64.iso"; then
  echo "Error: ISO download failed" >&2
  exit 1
fi
echo "ISO downloaded: metal-amd64.iso"
'''
165+
166+
# Uses whichever kubeconfig/context kubectl resolves in the calling environment.
[tasks.nodes]
description = "Show cluster nodes"
run = 'kubectl get nodes -o wide'
169+
170+
# Lists pods across every namespace (-A).
[tasks.pods]
description = "Show all pods in cluster"
run = 'kubectl get pods -A'
173+
174+
# Targets the node given as the first argument, falling back to "rei".
#
# The script begins with a shebang so that mise runs it as a script file and
# passes the command-line arguments as positional parameters ($1).
# NOTE(review): confirm this argument handling against the mise version in use.
# NOTE(review): talosctl does not appear to provide a `shell` subcommand;
# confirm this command works or replace it (e.g. with `talosctl dashboard`).
[tasks.talos-shell]
description = "Open shell on a node (default: rei, or specify: mise run talos-shell asuka)"
run = '''
#!/bin/sh
set -eu

NODE="${1:-rei}"
echo "Opening shell on node: $NODE"
talosctl -n "$NODE" -e "$NODE" shell
'''
181+
182+
# First argument: service name (default kubelet). Second: node (default rei).
#
# The script begins with a shebang so that mise runs it as a script file and
# passes the command-line arguments as positional parameters ($1, $2).
# NOTE(review): confirm this argument handling against the mise version in use.
[tasks.logs]
description = "Show service logs (usage: mise run logs [service] [node])"
run = '''
#!/bin/sh
set -eu

SERVICE="${1:-kubelet}"
NODE="${2:-rei}"
echo "Showing logs for $SERVICE on $NODE"
talosctl -n "$NODE" -e "$NODE" logs "$SERVICE"
'''
190+
191+
# Targets the node given as the first argument, falling back to "rei".
#
# The script begins with a shebang so that mise runs it as a script file and
# passes the command-line arguments as positional parameters ($1).
# NOTE(review): confirm this argument handling against the mise version in use.
[tasks.dmesg]
description = "Show kernel logs from a node (default: rei)"
run = '''
#!/bin/sh
set -eu

NODE="${1:-rei}"
echo "Showing kernel logs on $NODE"
talosctl -n "$NODE" -e "$NODE" dmesg
'''
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Single rule: files whose path matches the regex are encrypted for the PGP
# fingerprint below.
creation_rules:
  - path_regex: .*secret.*\.yaml$
    pgp: FAC8E7C3A2BC7DEE58A01C5928E1AB8AF0CF07A5
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
---
# yaml-language-server: $schema=https://taskfile.dev/schema.json
version: "3"

tasks:
  init:
    desc: Generate and encrypt Talos secrets
    cmds:
      # The encrypted file is written to a temporary name and only moved into
      # place when sops succeeds. A failed encryption therefore never leaves an
      # empty talsecret.sops.yaml behind, and the plaintext is only removed
      # once the encrypted copy exists.
      - |
        set -e
        if [ ! -f "talsecret.sops.yaml" ]; then
          echo "Generating new Talos secrets..."
          talhelper gensecret > talsecret.yaml
          echo "Encrypting secrets with PGP key..."
          if sops --encrypt --pgp FAC8E7C3A2BC7DEE58A01C5928E1AB8AF0CF07A5 talsecret.yaml > talsecret.sops.yaml.tmp; then
            mv talsecret.sops.yaml.tmp talsecret.sops.yaml
            rm talsecret.yaml
            echo "✅ Secrets generated and encrypted"
          else
            rm -f talsecret.sops.yaml.tmp
            echo "Encryption failed; plaintext talsecret.yaml was left in place" >&2
            exit 1
          fi
        else
          echo "Secrets file already exists"
        fi

  genconfig:
    desc: Generate Talos configuration files
    cmds:
      - talhelper genconfig
    preconditions:
      - sh: test -f talconfig.yaml
        msg: Missing talconfig.yaml
      - sh: test -f talsecret.sops.yaml
        msg: Missing talsecret.sops.yaml - run 'mise run init' first

  apply:
    desc: Apply Talos configuration to nodes
    cmds:
      # talhelper prints the talosctl commands; bash executes them.
      - talhelper gencommand apply | bash
    preconditions:
      - sh: test -d clusterconfig
        msg: Missing clusterconfig directory - run 'mise run genconfig' first

  apply-insecure:
    desc: Apply Talos configuration to nodes in maintenance mode (initial install)
    cmds:
      - talhelper gencommand apply --extra-flags="--insecure" | bash
    preconditions:
      - sh: test -d clusterconfig
        msg: Missing clusterconfig directory - run 'mise run genconfig' first

  bootstrap:
    desc: Bootstrap the etcd cluster on first node
    cmds:
      - talhelper gencommand bootstrap | bash
    preconditions:
      - sh: test -d clusterconfig
        msg: Missing clusterconfig directory - run 'mise run genconfig' first

  kubeconfig:
    desc: Fetch kubeconfig from Talos cluster
    cmds:
      - talhelper gencommand kubeconfig --extra-flags="../../ --force" | bash
      # Restrict the fetched kubeconfig to the owner.
      - chmod 600 ../../kubeconfig
    preconditions:
      - sh: test -d clusterconfig
        msg: Missing clusterconfig directory

  test-thunderbolt:
    desc: Test Thunderbolt network connectivity
    cmds:
      - echo "Testing Thunderbolt interfaces..."
      - talosctl --nodes rei get links | grep -E "thunderbolt|169.254" || echo "No Thunderbolt on rei"
      - talosctl --nodes asuka get links | grep -E "thunderbolt|169.254" || echo "No Thunderbolt on asuka"
      - talosctl --nodes kaji get links | grep -E "thunderbolt|169.254" || echo "No Thunderbolt on kaji"

  discover-thunderbolt:
    desc: Discover Thunderbolt interfaces using kubectl-node-shell (requires running cluster)
    cmds:
      - |
        echo "=== Discovering Thunderbolt interfaces on all nodes ==="
        echo "Note: This requires kubectl-node-shell to be installed"
        echo ""
        for node in rei asuka kaji; do
          echo "=== Node: $node ==="
          kubectl node-shell $node -- sh -c 'ls -la /sys/bus/thunderbolt/devices/ 2>/dev/null || echo "No Thunderbolt devices found"'
          echo ""
          echo "Network interfaces with bus paths:"
          kubectl node-shell $node -- sh -c 'for iface in $(ls /sys/class/net/); do if [ "$iface" != "lo" ]; then echo -n "$iface: "; readlink /sys/class/net/$iface | grep -oE "[0-9]+-[0-9]+\.[0-9]+" || echo "Not Thunderbolt"; fi; done'
          echo ""
        done
    preconditions:
      - sh: kubectl get nodes
        msg: Cluster must be running
      - sh: which kubectl-node-shell
        msg: kubectl-node-shell must be installed (kubectl krew install node-shell)

  deploy-thunderbolt-debug:
    desc: Deploy privileged DaemonSet for Thunderbolt debugging
    cmds:
      - kubectl apply -f thunderbolt-debug.yaml
      - echo "Waiting for pods to start..."
      - kubectl -n kube-system wait --for=condition=Ready pod -l app=thunderbolt-debug --timeout=60s
      - echo ""
      - echo "=== Thunderbolt Debug Pods Running ==="
      - kubectl -n kube-system get pods -l app=thunderbolt-debug -o wide
    preconditions:
      - sh: test -f thunderbolt-debug.yaml
        msg: thunderbolt-debug.yaml not found
      - sh: kubectl get nodes
        msg: Cluster must be running

  cleanup-thunderbolt-debug:
    desc: Remove Thunderbolt debug DaemonSet
    cmd: kubectl delete -f thunderbolt-debug.yaml --ignore-not-found=true

  generate-thunderbolt-patch:
    desc: Generate Thunderbolt patch file based on discovered interfaces
    cmds:
      # Only prints an example patch; no file is written.
      - |
        echo "After discovering Thunderbolt interfaces, create patches/node/NODE-thunderbolt.yaml"
        echo "Example patch structure:"
        echo ""
        cat <<'EOF'
        # patches/node/rei-thunderbolt.yaml
        machine:
          network:
            interfaces:
              - deviceSelector:
                  busPath: "0-1.0" # Replace with actual discovered bus path
                dhcp: false
                mtu: 65520
                addresses:
                  - 169.254.255.101/32
                routes:
                  - network: 169.254.255.102/32
                    metric: 2048
              - deviceSelector:
                  busPath: "0-3.0" # Replace with actual discovered bus path
                dhcp: false
                mtu: 65520
                addresses:
                  - 169.254.255.101/32
                routes:
                  - network: 169.254.255.103/32
                    metric: 2048
        EOF

  dashboard:
    desc: Open Talos dashboard
    cmd: talosctl dashboard --nodes rei,asuka,kaji

  health:
    desc: Check cluster health
    cmd: talosctl health --server=false

  reset:
    desc: Reset nodes back to maintenance mode (DESTRUCTIVE)
    prompt: This will destroy your cluster! Continue?
    cmd: talhelper gencommand reset --extra-flags="--reboot --system-labels-to-wipe STATE --system-labels-to-wipe EPHEMERAL --graceful=false --wait=false" | bash

  upgrade:
    desc: Upgrade Talos on a node
    cmd: talosctl --nodes {{.node}} upgrade --image {{.image}} --wait=true --timeout=10m --preserve=true
    requires:
      vars: ["node", "image"]
    preconditions:
      - msg: Unable to connect to node
        sh: talosctl --nodes {{.node}} version

  upgrade-k8s:
    desc: Upgrade Kubernetes across the cluster
    cmd: talosctl --nodes {{.controller}} upgrade-k8s --to {{.to}}
    requires:
      vars: ["controller", "to"]
    preconditions:
      - msg: Unable to connect to controller
        sh: talosctl --nodes {{.controller}} version
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
---
# Runs irqbalance as a privileged, host-namespaced pod on every node carrying
# the control-plane role label.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: irqbalance
  namespace: kube-system
  labels:
    app: irqbalance
spec:
  selector:
    matchLabels:
      app: irqbalance
  template:
    metadata:
      labels:
        app: irqbalance
    spec:
      hostNetwork: true
      hostPID: true
      hostIPC: true
      containers:
        - name: irqbalance
          # Image is pinned by digest as well as by tag.
          image: ghcr.io/home-operations/irqbalance:1.9.4@sha256:86f83ccf82033339f19981697f947d96194539d6b130fa5a4336e887461fe7dc
          command: ["/usr/sbin/irqbalance"]
          args: ["--foreground", "--journal"]
          env:
            # NOTE(review): presumably excludes CPUs 12-19 from IRQ balancing;
            # confirm the range matches the nodes' CPU topology.
            - name: IRQBALANCE_BANNED_CPULIST
              value: "12-19"
          resources:
            requests:
              cpu: 25m
              memory: 64Mi
            limits:
              memory: 128Mi
          securityContext:
            privileged: true
          volumeMounts:
            - name: run
              mountPath: /run/irqbalance
      volumes:
        - name: run
          emptyDir: {}
      tolerations:
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
      nodeSelector:
        node-role.kubernetes.io/control-plane: ""

0 commit comments

Comments
 (0)