Skip to content

Commit caef0b4

Browse files
adrianriobo and claude committed
fix(gitlab): fix nested-container DNS for Podman builds across platforms
Resolves DNS failures in nested container builds inside GitLab CI executor containers on ppc64le, s390x, Ubuntu, and RHEL9: - detect real upstream DNS servers via resolvectl/nmcli/resolv.conf and write them to containers.conf dns_servers so inner Podman containers get working nameservers (resolv.conf stubs like 127.0.0.53 are unreachable from inside nested containers) - replace netns=host workaround with ip_forward=1 so Netavark bridge NAT works correctly, giving each inner container an isolated namespace while still routing egress through the executor container's interface - persist ip_forward sysctl and firewalld masquerade rules across reboots - use a non-conflicting Netavark subnet (192.168.100.0/24) for the nested network so it does not collide with the host bridge's 10.88.0.0/16 - use a separate executor containers.conf to fix nested DNS on ppc64le/RHEL9 - fix Ubuntu executor DNS by passing --docker-dns at runner registration - increase runner output_limit to 64 MB to prevent log truncation Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 2bdff20 commit caef0b4

1 file changed

Lines changed: 155 additions & 1 deletion

File tree

pkg/integrations/gitlab/snippet-linux.sh

Lines changed: 155 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,168 @@ sudo restorecon -v /usr/bin/gitlab-runner 2>/dev/null || true
1414
# Enable and start the Podman API socket so the runner's docker executor can
# reach it (the runner is registered further down with
# --docker-host unix:///run/podman/podman.sock).
1515
sudo systemctl enable --now podman.socket
1616

17+
# Detect the host's upstream DNS servers and propagate them into every Podman
# container (including nested build containers created by `podman build`).
# Without this, inner build containers inherit a loopback stub address
# (127.0.0.53 / systemd-resolved) that is unreachable from inside a container,
# causing DNS resolution failures like "Could not resolve host: github.com".

# Normalise a stream of candidate addresses (one per line, read from stdin)
# into a single space-separated line on stdout.
#   - Loopback entries (127.x.x.x, ::1) are dropped: from inside a container
#     they point at the container itself, not at the host's resolver.
#   - Duplicates are dropped while keeping first-seen order, so the resolver
#     priority reported by the host is preserved (a plain `sort -u` would
#     reorder the servers lexically).
_normalize_dns_list() {
  awk 'NF && $1 !~ /^127\./ && $1 != "::1" && !seen[$1]++ {print $1}' \
    | tr '\n' ' ' | xargs
}

# Print the host's upstream DNS servers as one space-separated line (empty
# when nothing usable is found). Sources are tried in order and the first one
# that yields at least one usable address wins:
#   1. resolvectl dns          (IPv4 addresses only)
#   2. nmcli dev show          (IP4.DNS entries)
#   3. /run/systemd/resolve/resolv.conf  (on systemd-resolved systems such as
#      Ubuntu this holds the real upstreams, not the 127.0.0.53 stub)
#   4. /etc/resolv.conf
_detect_dns_servers() {
  local _found=""
  if command -v resolvectl &>/dev/null; then
    # Field 1 is the "Global:" / "Link N (iface):" label; the grep keeps only
    # dotted-quad tokens from the remaining fields.
    _found=$(resolvectl dns 2>/dev/null \
      | awk '{for(i=2;i<=NF;i++) print $i}' \
      | grep -E '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$' \
      | _normalize_dns_list)
  fi
  if [ -z "$_found" ] && command -v nmcli &>/dev/null; then
    _found=$(nmcli dev show 2>/dev/null \
      | awk '/IP4\.DNS/ {print $2}' \
      | _normalize_dns_list)
  fi
  if [ -z "$_found" ]; then
    _found=$(awk '/^nameserver/ {print $2}' \
        /run/systemd/resolve/resolv.conf 2>/dev/null \
      | _normalize_dns_list)
  fi
  if [ -z "$_found" ]; then
    _found=$(awk '/^nameserver/ {print $2}' /etc/resolv.conf 2>/dev/null \
      | _normalize_dns_list)
  fi
  printf '%s\n' "$_found"
}

_dns_servers=$(_detect_dns_servers)

# Last-resort fallback: if no local DNS could be detected, use public resolvers.
# NOTE(review): this assumes the host can reach public DNS; on an isolated
# network with a self-hosted GitLab these addresses may be unreachable.
if [ -z "$_dns_servers" ]; then
  _dns_servers="8.8.8.8 8.8.4.4"
fi

# Build --docker-dns flags for runner registration so every job container gets
# working DNS servers even when Podman's Docker socket API does not honour
# containers.conf dns_servers (which affects executor-container resolution).
_docker_dns_args=()
# Intentionally unquoted: _dns_servers is a space-separated list.
for _ip in $_dns_servers; do
  _docker_dns_args+=(--docker-dns "$_ip")
done
57+
58+
# Filter: read a containers.conf-style TOML document on stdin and write it to
# stdout with LINE set inside SECTION.
#   $1 SECTION  literal table header, e.g. '[containers]'
#   $2 KEY      key name, e.g. dns_servers
#   $3 LINE     full replacement line, e.g. 'dns_servers = ["1.1.1.1"]'
# Behaviour:
#   - KEY already present inside SECTION -> its first occurrence is replaced
#   - SECTION present without KEY        -> LINE is inserted right after the header
#   - SECTION absent                     -> SECTION and LINE are appended
# Keys with the same name in other sections are never touched.
# Limitation: only single-line values are handled; a multi-line array value
# would have just its first line replaced.
_toml_upsert() {
  awk -v section="$1" -v key="$2" -v line="$3" '
    { buf[NR] = $0 }
    END {
      # Pass 1: find the section header and the key inside that section.
      for (i = 1; i <= NR; i++) {
        if (buf[i] ~ /^\[/) {
          in_sec = (index(buf[i], section) == 1)
          if (in_sec && !hdr) hdr = i
        } else if (in_sec && !hit && buf[i] ~ ("^" key "[ \t]*=")) {
          hit = i
        }
      }
      # Pass 2: emit the document with the replacement/insertion applied.
      for (i = 1; i <= NR; i++) {
        if (i == hit) print line
        else print buf[i]
        if (i == hdr && !hit) print line
      }
      if (!hdr) {
        if (NR) print ""
        print section
        print line
      }
    }
  '
}

# Write the detected DNS servers (and resolver options that make retries fast)
# into the [containers] section of the host containers.conf.
if [ -n "$_dns_servers" ]; then
  # Render the space-separated list as a TOML array body: "a", "b", ...
  _toml_list=""
  for _ip in $_dns_servers; do
    _toml_list="${_toml_list:+${_toml_list}, }\"${_ip}\""
  done
  _conf=/etc/containers/containers.conf
  sudo mkdir -p /etc/containers
  # A missing file yields empty input, for which the filter emits a fresh
  # [containers] section.
  _conf_new=$(sudo cat "$_conf" 2>/dev/null \
    | _toml_upsert '[containers]' dns_servers "dns_servers = [${_toml_list}]" \
    | _toml_upsert '[containers]' dns_options \
        'dns_options = ["timeout:2", "attempts:5", "single-request"]')
  # Never replace the config with empty output (e.g. if awk is unavailable).
  if [ -n "$_conf_new" ]; then
    # Write to a temp file and rename so readers never see a partial file.
    printf '%s\n' "$_conf_new" | sudo tee "${_conf}.tmp" > /dev/null \
      && sudo mv "${_conf}.tmp" "$_conf"
  fi
  # Ensure the file is world-readable so rootless Podman can also load it
  sudo chmod 644 "$_conf"
fi
98+
99+
# Make sure a host containers.conf exists no matter what happened above, so the
# later copy into executor-containers.conf (the file the runner bind-mounts)
# always has a source. A brand-new file gets an empty [containers] table and
# world-readable permissions.
sudo mkdir -p /etc/containers
[ -f /etc/containers/containers.conf ] || {
  printf '%s\n' '[containers]' | sudo tee /etc/containers/containers.conf > /dev/null
  sudo chmod 644 /etc/containers/containers.conf
}
106+
107+
{{- if .LogToJournald}}
# Route container logs to journald so CI job output lands in the systemd
# journal and can be correlated with runner daemon logs via job_id.
# The setting is written into the [containers] table of containers.conf:
#   file missing                     -> create it with the table and the key
#   file without a [containers]      -> append the table and the key
#   table has a log_driver already   -> rewrite that line in place
#   table lacks log_driver           -> insert the key right after the header
_cconf=/etc/containers/containers.conf
_jd_line='log_driver = "journald"'
sudo mkdir -p "${_cconf%/*}"
if [ -f "$_cconf" ]; then
  if ! grep -q '^\[containers\]' "$_cconf"; then
    printf '\n[containers]\n%s\n' "$_jd_line" \
      | sudo tee -a "$_cconf" > /dev/null
  elif awk '/^\[containers\]/{f=1;next} /^\[/{f=0} f && /^log_driver/{found=1} END{exit !found}' \
      "$_cconf"; then
    # Only log_driver lines inside [containers] are rewritten; the result goes
    # through a temp file that is then renamed over the original.
    awk -v val="$_jd_line" \
      '/^\[containers\]/{s=1} /^\[/ && !/^\[containers\]/{s=0}
       s && /^log_driver/{$0=val} 1' \
      "$_cconf" \
      | sudo tee "${_cconf}.tmp" > /dev/null \
      && sudo mv "${_cconf}.tmp" "$_cconf"
  else
    sudo sed -i "/^\[containers\]/a ${_jd_line}" "$_cconf"
  fi
else
  printf '[containers]\n%s\n' "$_jd_line" \
    | sudo tee "$_cconf" > /dev/null
fi
sudo chmod 644 "$_cconf"
{{- end}}
133+
134+
# Create an executor-specific containers.conf that adds a non-conflicting inner
# subnet for nested Netavark networks. This snippet never writes a [network]
# table into the host containers.conf, so the host Podman bridge keeps its
# default 10.88.0.0/16. The executor copy sets
# default_subnet = 192.168.100.0/24 so that Netavark inside a privileged
# executor container creates a bridge in a different subnet, eliminating the
# duplicate-route conflict that breaks DNS in nested containers on
# Netavark-based hosts (RHEL 9 / ppc64le).

# Filter: copy a containers.conf document from stdin to stdout with
# [network] default_subnet forced to the nested-executor subnet.
#   - If a [network] table already exists (a pre-existing host file may have
#     one), the key is placed right after its header and any previous
#     default_subnet line in that table is dropped. Appending a second
#     [network] table instead would make the file invalid TOML.
#   - Otherwise a blank line, the [network] header and the key are appended.
_executor_conf_filter() {
  awk -v line='default_subnet = "192.168.100.0/24"' '
    /^\[/ {
      in_net = (index($0, "[network]") == 1)
      print
      if (in_net && !seen) { print line; seen = 1 }
      next
    }
    in_net && /^default_subnet[ \t]*=/ { next }
    { print }
    END { if (!seen) printf "\n[network]\n%s\n", line }
  '
}

sudo cat /etc/containers/containers.conf \
  | _executor_conf_filter \
  | sudo tee /etc/containers/executor-containers.conf > /dev/null
sudo chmod 644 /etc/containers/executor-containers.conf
145+
146+
# Turn on IPv4 forwarding so Netavark can NAT container traffic out through
# the host's network interface. The same two keys are written to a sysctl.d
# drop-in (so they survive reboots) and applied to the running kernel.
_fwd_keys=(net.ipv4.ip_forward net.ipv4.conf.all.forwarding)
# printf repeats its format once per key, producing one "key = 1" line each.
printf '%s = 1\n' "${_fwd_keys[@]}" \
  | sudo tee /etc/sysctl.d/99-podman-ip-forward.conf > /dev/null
for _fwd_key in "${_fwd_keys[@]}"; do
  sudo sysctl -w "${_fwd_key}=1"
done
152+
153+
# Ensure NAT masquerade is active for the Podman bridge subnet.
# On RHEL/firewalld systems, Netavark normally configures this, but
# 'podman system reset' can leave firewalld without the masquerade rule
# until the first container is actually created — too late for the runner
# to resolve DNS at job startup. We add the rule explicitly so it is in
# place before any job container tries to reach an external DNS server.

# Add the MASQUERADE rule for 10.88.0.0/16 only when it is not already there.
# `iptables -C` checks for an identical rule; without that check every re-run
# of this snippet would append another copy to POSTROUTING.
# Best effort: failures (e.g. iptables not installed) are ignored on purpose.
_ensure_podman_masquerade() {
  local -a _rule=(POSTROUTING -s 10.88.0.0/16 '!' -d 10.88.0.0/16 -j MASQUERADE)
  if ! sudo iptables -t nat -C "${_rule[@]}" 2>/dev/null; then
    sudo iptables -t nat -A "${_rule[@]}" 2>/dev/null || true
  fi
}
_ensure_podman_masquerade

# On firewalld systems (RHEL/Fedora), enable masquerade permanently so it
# survives firewalld restarts and reboots, then reload to activate immediately.
# Errors are ignored so hosts without firewalld are unaffected.
sudo firewall-cmd --permanent --add-masquerade 2>/dev/null || true
sudo firewall-cmd --reload 2>/dev/null || true
165+
17166
# Register runner using docker executor backed by Podman
18167
# --docker-privileged is required for Podman: containers need CAP_SYS_ADMIN to mount /proc
# "${_docker_dns_args[@]}" expands to the --docker-dns flags built earlier from
# the detected DNS servers (one flag pair per server).
# --docker-volumes bind-mounts the host's executor-containers.conf read-only at
# /etc/containers/containers.conf, so Podman running inside the container picks
# up the DNS settings and the non-conflicting nested subnet.
19168
sudo gitlab-runner register \
20169
--non-interactive \
21170
--url "{{ .RepoURL }}" \
22171
--token "{{ .Token }}" \
172+
--name "{{ .Name }}" \
23173
--executor "docker" \
24174
--docker-image "fedora:latest" \
25175
--docker-host "unix:///run/podman/podman.sock" \
26-
--docker-privileged
176+
--docker-privileged \
177+
"${_docker_dns_args[@]}" \
178+
--docker-volumes "/etc/containers/executor-containers.conf:/etc/containers/containers.conf:ro"
27179

28180
{{- if not .Unsecure}}
29181
# Create a dedicated system user for running CI jobs
@@ -43,5 +195,7 @@ sudo gitlab-runner install --user="${RUNNER_USER}"
43195
{{- if .Concurrent}}
# Override the global job concurrency written by gitlab-runner.
sudo sed -i "s/^concurrent = .*/concurrent = {{.Concurrent}}/" /etc/gitlab-runner/config.toml
{{- end}}

# Set output_limit = 65536 under every [[runners]] entry of the given
# config.toml ($1). 65536 corresponds to the 64 MB job-log cap this change
# targets; the default of 4 MB is too small for long builds like PyTorch.
# Existing output_limit lines are deleted first, so running this again (or
# registering another runner later) never leaves a duplicate key, which would
# make the TOML file invalid.
_set_runner_output_limit() {
  sudo sed -i \
    -e '/^[[:space:]]*output_limit[[:space:]]*=/d' \
    -e '/^\[\[runners\]\]/a\ output_limit = 65536' \
    "$1"
}
_set_runner_output_limit /etc/gitlab-runner/config.toml

# Pick up the unit installed by `gitlab-runner install`, then start the
# service now and on every boot.
sudo systemctl daemon-reload
sudo systemctl enable --now gitlab-runner

0 commit comments

Comments
 (0)