Skip to content

Commit 0dfdff8

Browse files
authored
feat: reduce node impact by aks-log-collector (#8598)
1 parent 5d17e2c commit 0dfdff8

5 files changed

Lines changed: 137 additions & 94 deletions

File tree

parts/linux/cloud-init/artifacts/aks-log-collector.sh

Lines changed: 126 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,13 @@ shopt -s nullglob nocaseglob extglob
1919
# JSON body:
2020
# {
2121
# "disable": false,
22+
# "waagent_full": false,
2223
# "files": [ "/etc/skel/.bashrc", "/etc/skel/.bash_profile" ],
2324
# "pod_log_namespaces": [ "default", "pahealy" ],
2425
# "iptables": false,
2526
# "nftables": false,
26-
# "netns": true
27+
# "netns": true,
28+
# "sysinfo": false
2729
# }
2830
CONFIG=$(curl -s -H Metadata:true --noproxy '*' 'http://169.254.169.254/metadata/instance/compute?api-version=2021-02-01' | jq '.tagsList[] | select(.name=="aks-log-collector") | .value | fromjson')
2931

@@ -36,6 +38,8 @@ CONFIG=$(curl -s -H Metadata:true --noproxy '*' 'http://169.254.169.254/metadata
3638
COLLECT_IPTABLES=$(<<<"$CONFIG" jq -esRr 'try fromjson catch null | .iptables? // false')
3739
COLLECT_NFTABLES=$(<<<"$CONFIG" jq -esRr 'try fromjson catch null | .nftables? // false')
3840
COLLECT_NETNS=$(<<<"$CONFIG" jq -esRr 'try fromjson catch null | .netns? // false')
41+
COLLECT_SYSINFO=$(<<<"$CONFIG" jq -esRr 'try fromjson catch null | .sysinfo? // false')
42+
COLLECT_WAAGENT_FULL=$(<<<"$CONFIG" jq -esRr 'try fromjson catch null | .waagent_full? // false')
3943

4044
### START CONFIGURATION
4145
ZIP="aks_logs.zip"
@@ -75,63 +79,71 @@ GLOBS+=(/var/log/nvidia*.log)
7579
GLOBS+=(/var/log/azure/nvidia*.log)
7680
GLOBS+=(/var/log/fabricmanager*.log)
7781

78-
# based on MANIFEST_FULL from Azure Linux Agent's log collector
79-
# https://github.com/Azure/WALinuxAgent/blob/master/azurelinuxagent/common/logcollector_manifests.py
80-
GLOBS+=(/var/lib/waagent/provisioned)
81-
GLOBS+=(/etc/fstab)
82-
GLOBS+=(/etc/ssh/sshd_config)
83-
GLOBS+=(/boot/grub*/grub.c*)
84-
GLOBS+=(/boot/grub*/menu.lst)
82+
# Configuration files (small, critical for diagnosis)
8583
GLOBS+=(/etc/*-release)
8684
GLOBS+=(/etc/HOSTNAME)
8785
GLOBS+=(/etc/hostname)
88-
GLOBS+=(/etc/apt/sources.list)
89-
GLOBS+=(/etc/apt/sources.list.d/*)
90-
GLOBS+=(/etc/network/interfaces)
91-
GLOBS+=(/etc/network/interfaces.d/*.cfg)
92-
GLOBS+=(/etc/netplan/*.yaml)
93-
GLOBS+=(/etc/nsswitch.conf)
86+
GLOBS+=(/etc/waagent.conf)
9487
GLOBS+=(/etc/resolv.conf)
9588
GLOBS+=(/run/systemd/resolve/stub-resolv.conf)
96-
GLOBS+=(/run/resolvconf/resolv.conf)
97-
GLOBS+=(/etc/sysconfig/iptables)
98-
GLOBS+=(/etc/sysconfig/network)
99-
GLOBS+=(/etc/sysconfig/network/ifcfg-eth*)
100-
GLOBS+=(/etc/sysconfig/network/routes)
101-
GLOBS+=(/etc/sysconfig/network-scripts/ifcfg-eth*)
102-
GLOBS+=(/etc/sysconfig/network-scripts/route-eth*)
103-
GLOBS+=(/etc/ufw/ufw.conf)
104-
GLOBS+=(/etc/waagent.conf)
105-
GLOBS+=(/var/lib/hyperv/.kvp_pool_*)
106-
GLOBS+=(/var/lib/dhcp/dhclient.eth0.leases)
107-
GLOBS+=(/var/lib/dhclient/dhclient-eth0.leases)
108-
GLOBS+=(/var/lib/wicked/lease-eth0-dhcp-ipv4.xml)
89+
90+
# Key log files
91+
GLOBS+=(/var/log/dmesg*)
92+
GLOBS+=(/var/log/syslog*)
93+
GLOBS+=(/var/log/messages*)
94+
GLOBS+=(/var/log/secure*)
95+
GLOBS+=(/var/log/auth*)
96+
GLOBS+=(/var/log/cloud-init*)
97+
GLOBS+=(/var/log/azure/*/*)
98+
GLOBS+=(/var/log/azure/*/*/*)
10999
GLOBS+=(/var/log/azure/custom-script/handler.log)
110100
GLOBS+=(/var/log/azure/run-command/handler.log)
101+
102+
# Extension state
111103
GLOBS+=(/var/lib/waagent/ovf-env.xml)
104+
GLOBS+=(/var/lib/waagent/waagent_status.json)
112105
GLOBS+=(/var/lib/waagent/*/status/*.status)
113106
GLOBS+=(/var/lib/waagent/*/config/*.settings)
114107
GLOBS+=(/var/lib/waagent/*/config/HandlerState)
115108
GLOBS+=(/var/lib/waagent/*/config/HandlerStatus)
116-
GLOBS+=(/var/lib/waagent/SharedConfig.xml)
117-
GLOBS+=(/var/lib/waagent/ManagedIdentity-*.json)
118-
GLOBS+=(/var/lib/waagent/waagent_status.json)
119109
GLOBS+=(/var/lib/waagent/*/error.json)
120-
GLOBS+=(/var/log/cloud-init*)
121-
GLOBS+=(/var/log/azure/*/*)
122-
GLOBS+=(/var/log/azure/*/*/*)
123-
GLOBS+=(/var/log/syslog*)
124-
GLOBS+=(/var/log/rsyslog*)
125-
GLOBS+=(/var/log/messages*)
126-
GLOBS+=(/var/log/kern*)
127-
GLOBS+=(/var/log/dmesg*)
128-
GLOBS+=(/var/log/dpkg*)
129-
GLOBS+=(/var/log/yum*)
130-
GLOBS+=(/var/log/boot*)
131-
GLOBS+=(/var/log/auth*)
132-
GLOBS+=(/var/log/secure*)
133-
GLOBS+=(/var/log/journal*)
134110

111+
# Based on MANIFEST_FULL from Azure Linux Agent's log collector
112+
# https://github.com/Azure/WALinuxAgent/blob/master/azurelinuxagent/ga/logcollector_manifests.py
113+
if [ "$COLLECT_WAAGENT_FULL" = "true" ]; then
114+
GLOBS+=(/var/lib/waagent/provisioned)
115+
GLOBS+=(/etc/fstab)
116+
GLOBS+=(/etc/ssh/sshd_config)
117+
GLOBS+=(/boot/grub*/grub.c*)
118+
GLOBS+=(/boot/grub*/menu.lst)
119+
GLOBS+=(/etc/apt/sources.list)
120+
GLOBS+=(/etc/apt/sources.list.d/*)
121+
GLOBS+=(/etc/network/interfaces)
122+
GLOBS+=(/etc/network/interfaces.d/*.cfg)
123+
GLOBS+=(/etc/netplan/*.yaml)
124+
GLOBS+=(/etc/nsswitch.conf)
125+
GLOBS+=(/run/resolvconf/resolv.conf)
126+
GLOBS+=(/etc/sysconfig/iptables)
127+
GLOBS+=(/etc/sysconfig/network)
128+
GLOBS+=(/etc/sysconfig/network/ifcfg-eth*)
129+
GLOBS+=(/etc/sysconfig/network/routes)
130+
GLOBS+=(/etc/sysconfig/network-scripts/ifcfg-eth*)
131+
GLOBS+=(/etc/sysconfig/network-scripts/route-eth*)
132+
GLOBS+=(/etc/ufw/ufw.conf)
133+
GLOBS+=(/var/lib/hyperv/.kvp_pool_*)
134+
GLOBS+=(/var/lib/dhcp/dhclient.eth0.leases)
135+
GLOBS+=(/var/lib/dhclient/dhclient-eth0.leases)
136+
GLOBS+=(/var/lib/wicked/lease-eth0-dhcp-ipv4.xml)
137+
GLOBS+=(/var/lib/waagent/SharedConfig.xml)
138+
GLOBS+=(/var/lib/waagent/ManagedIdentity-*.json)
139+
# Rotated and additional log files
140+
GLOBS+=(/var/log/rsyslog*)
141+
GLOBS+=(/var/log/kern*)
142+
GLOBS+=(/var/log/dpkg*)
143+
GLOBS+=(/var/log/yum*)
144+
GLOBS+=(/var/log/boot*)
145+
GLOBS+=(/var/log/journal*)
146+
fi
135147
### END CONFIGURATION
136148

137149
command -v zip >/dev/null || {
@@ -187,25 +199,30 @@ echo "Collecting system information..."
187199
mkdir collect
188200

189201
# Collect general information and create the ZIP in the first place
190-
zip -DZ deflate "${ZIP}" /proc/@(cmdline|cpuinfo|filesystems|interrupts|loadavg|meminfo|modules|mounts|slabinfo|stat|uptime|version*|vmstat) /proc/net/*
191-
192-
# Include some disk listings
193-
collectToZip collect/file_listings.txt find /dev /etc /var/lib/waagent /var/log -ls
194-
195-
# Collect system information
196-
collectToZip collect/blkid.txt blkid $(find /dev -type b ! -name 'sr*')
197-
collectToZip collect/du_bytes.txt df -al
198-
collectToZip collect/du_inodes.txt df -ail
199-
collectToZip collect/diskinfo.txt lsblk
200-
collectToZip collect/lscpu.txt lscpu
201-
collectToZip collect/lscpu.json lscpu -J
202-
collectToZip collect/lsipc.txt lsipc
203-
collectToZip collect/lsns.json lsns -J --output-all
204-
collectToZip collect/lspci.txt lspci -vkPP
205-
collectToZip collect/lsscsi.txt lsscsi -vv
206-
collectToZip collect/lsvmbus.txt lsvmbus -vv
207-
collectToZip collect/sysctl.txt sysctl -a
208-
collectToZip collect/systemctl-status.txt systemctl status --all -fr
202+
zip -DZ deflate "${ZIP}" /proc/@(cmdline|loadavg|meminfo|mounts|uptime|version*)
203+
204+
if [ "$COLLECT_SYSINFO" = "true" ]; then
205+
# Extensive proc info
206+
zip -gDZ deflate "${ZIP}" /proc/@(cpuinfo|filesystems|interrupts|modules|slabinfo|stat|vmstat) /proc/net/*
207+
208+
# Include some disk listings
209+
collectToZip collect/file_listings.txt find /dev /etc /var/lib/waagent /var/log -ls
210+
211+
# Collect system information
212+
collectToZip collect/blkid.txt blkid $(find /dev -type b ! -name 'sr*')
213+
collectToZip collect/du_bytes.txt df -al
214+
collectToZip collect/du_inodes.txt df -ail
215+
collectToZip collect/diskinfo.txt lsblk
216+
collectToZip collect/lscpu.txt lscpu
217+
collectToZip collect/lscpu.json lscpu -J
218+
collectToZip collect/lsipc.txt lsipc
219+
collectToZip collect/lsns.json lsns -J --output-all
220+
collectToZip collect/lspci.txt lspci -vkPP
221+
collectToZip collect/lsscsi.txt lsscsi -vv
222+
collectToZip collect/lsvmbus.txt lsvmbus -vv
223+
collectToZip collect/sysctl.txt sysctl -a
224+
collectToZip collect/systemctl-status.txt systemctl status --all -fr
225+
fi
209226

210227
# Collect logs of the Nvidia services if present
211228
collectToZip collect/journalctl_nvidia-dcgm.txt journalctl -u nvidia-dcgm --no-pager
@@ -223,12 +240,16 @@ collectToZip collect/crictl_images.json crictl images -o json
223240
collectToZip collect/crictl_imagefsinfo.json crictl imagefsinfo -o json
224241
collectToZip collect/crictl_pods.json crictl pods -o json
225242
collectToZip collect/crictl_ps.json crictl ps -o json
226-
collectToZip collect/crictl_stats.json crictl stats -o json
227-
collectToZip collect/crictl_statsp.json crictl statsp -o json
243+
if [ "$COLLECT_SYSINFO" = "true" ]; then
244+
collectToZip collect/crictl_stats.json crictl stats -o json
245+
collectToZip collect/crictl_statsp.json crictl statsp -o json
246+
fi
228247

229248
# Collect network information
230-
collectToZip collect/conntrack.txt conntrack -L
231-
collectToZip collect/conntrack_stats.txt conntrack -S
249+
if [ "$COLLECT_SYSINFO" = "true" ]; then
250+
collectToZip collect/conntrack.txt conntrack -L
251+
collectToZip collect/conntrack_stats.txt conntrack -S
252+
fi
232253
collectToZip collect/ip_4_addr.json ip -4 -d -j addr show
233254
collectToZip collect/ip_4_neighbor.json ip -4 -d -j neighbor show
234255
collectToZip collect/ip_4_route.json ip -4 -d -j route show
@@ -251,15 +272,19 @@ if [ "${COLLECT_NFTABLES}" = "true" ]; then
251272
collectToZip collect/nftables.txt nft -n list ruleset 2>&1
252273
fi
253274

254-
collectToZip collect/ss.txt ss -anoempiO --cgroup
255-
collectToZip collect/ss_stats.txt ss -s
275+
if [ "$COLLECT_SYSINFO" = "true" ]; then
276+
collectToZip collect/ss.txt ss -anoempiO --cgroup
277+
collectToZip collect/ss_stats.txt ss -s
278+
fi
256279

257280
# Collect network information from network namespaces
258281
if [ "${COLLECT_NETNS}" = "true" ]; then
259282
for NETNS in $(ip -j netns list | jq -r '.[].name'); do
260283
mkdir -p "collect/ip_netns_${NETNS}/"
261-
collectToZip collect/ip_netns_${NETNS}/conntrack.txt ip netns exec "${NETNS}" conntrack -L
262-
collectToZip collect/ip_netns_${NETNS}/conntrack_stats.txt ip netns exec "${NETNS}" conntrack -S
284+
if [ "$COLLECT_SYSINFO" = "true" ]; then
285+
collectToZip collect/ip_netns_${NETNS}/conntrack.txt ip netns exec "${NETNS}" conntrack -L
286+
collectToZip collect/ip_netns_${NETNS}/conntrack_stats.txt ip netns exec "${NETNS}" conntrack -S
287+
fi
263288
collectToZip collect/ip_netns_${NETNS}/ip_4_addr.json ip -n "${NETNS}" -4 -d -j addr show
264289
collectToZip collect/ip_netns_${NETNS}/ip_4_neighbor.json ip -n "${NETNS}" -4 -d -j neighbor show
265290
collectToZip collect/ip_netns_${NETNS}/ip_4_route.json ip -n "${NETNS}" -4 -d -j route show
@@ -279,27 +304,42 @@ if [ "${COLLECT_NETNS}" = "true" ]; then
279304
if [ "${COLLECT_NFTABLES}" = "true" ]; then
280305
collectToZip collect/ip_netns_${NETNS}/nftables.txt nft -n list ruleset
281306
fi
282-
collectToZip collect/ip_netns_${NETNS}/ss.txt ip netns exec "${NETNS}" ss -anoempiO --cgroup
283-
collectToZip collect/ip_netns_${NETNS}/ss_stats.txt ip netns exec "${NETNS}" ss -s
307+
if [ "$COLLECT_SYSINFO" = "true" ]; then
308+
collectToZip collect/ip_netns_${NETNS}/ss.txt ip netns exec "${NETNS}" ss -anoempiO --cgroup
309+
collectToZip collect/ip_netns_${NETNS}/ss_stats.txt ip netns exec "${NETNS}" ss -s
310+
fi
284311
done
285312
fi
286313

287314
# Add each file sequentially to the zip archive. This is slightly less efficient than adding them
288315
# all at once, but allows us to easily check when we've exceeded the maximum file size and stop
289316
# adding things to the archive.
290-
echo "Adding log files to zip archive..."
291-
for file in ${GLOBS[*]}; do
292-
if test -e $file; then
293-
zip -g -DZ deflate -u "${ZIP}" $file -x '*.sock'
294-
295-
# The API for the log bundle has a max file size (defined above, usually 100MB), so if
296-
# adding this last file made the zip go over that size, remove that file and stop processing.
297-
FILE_SIZE=$(stat --printf "%s" ${ZIP})
298-
if [ "$FILE_SIZE" -ge "$MAX_SIZE" ]; then
299-
echo "WARNING: ZIP file size $FILE_SIZE >= $MAX_SIZE; removing last log file and terminating adding more files."
300-
zip -d "${ZIP}" $file
301-
break
302-
fi
317+
MAX_FILE_SIZE=$((10 * 1024 * 1024))
318+
echo "Adding log files to zip archive with max file size: $MAX_FILE_SIZE bytes..."
319+
for file in "${GLOBS[@]}"; do
320+
# shellcheck disable=SC3010
321+
[[ "$file" == *.gz ]] && continue
322+
test -e "$file" || continue
323+
324+
fsize=$(stat --printf "%s" "$file")
325+
if [ "$fsize" -gt "$MAX_FILE_SIZE" ]; then
326+
# Preserve directory structure so zip entry has the original path
327+
truncdir="${file%/*}"
328+
mkdir -p ".${truncdir}"
329+
mkfifo ".${file}"
330+
tail -c "$MAX_FILE_SIZE" "$file" >".${file}" &
331+
tail_pid=$!
332+
zip -gDZ deflate --fifo "${ZIP}" ".${file}"
333+
wait "$tail_pid" 2>/dev/null
334+
rm -f ".${file}"
335+
else
336+
zip -g -DZ deflate -u "${ZIP}" "$file" -x '*.sock'
337+
fi
338+
339+
FILE_SIZE=$(stat --printf "%s" "${ZIP}")
340+
if [ "$FILE_SIZE" -ge "$MAX_SIZE" ]; then
341+
echo "WARNING: ZIP file size $FILE_SIZE >= $MAX_SIZE; stopping."
342+
break
303343
fi
304344
done
305345

parts/linux/cloud-init/artifacts/aks-log-collector.timer

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,7 @@
22
Description=AKS Log Collector Timer
33

44
[Timer]
5-
OnActiveSec=0m
6-
OnBootSec=5min
5+
OnActiveSec=10min
76
OnUnitActiveSec=60m
87

98
[Install]

parts/linux/cloud-init/artifacts/cse_main.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -586,6 +586,12 @@ function nodePrep {
586586

587587
checkServiceHealth kubelet || exit $ERR_KUBELET_FAIL
588588

589+
if systemctl cat aks-log-collector.timer &>/dev/null; then
590+
systemctlEnableAndStartNoBlock aks-log-collector.timer 30 || echo "Warning: Could not start aks-log-collector.timer"
591+
else
592+
echo "aks-log-collector.timer not found on this VHD, skipping"
593+
fi
594+
589595
if $REBOOTREQUIRED; then
590596
echo 'reboot required, rebooting node in 1 minute'
591597
/bin/bash -c "shutdown -r 1 &"

parts/linux/cloud-init/artifacts/cse_start.sh

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,7 @@ echo ${EVENT_JSON} > ${EVENTS_LOGGING_DIR}${EVENTS_FILE_NAME}.json
109109

110110
# force a log upload to the host after the provisioning script finishes
111111
# if we failed, wait for the upload to complete so that we don't remove
112-
# the VM before it finishes. if we succeeded, upload in the background
113-
# so that the provisioning script returns success more quickly
112+
# the VM before it finishes.
114113
upload_logs() {
115114
# if the VHD has the AKS log collector installed, use it instead. Otherwise
116115
# fall back to WALA collector
@@ -127,8 +126,6 @@ upload_logs() {
127126
}
128127
if [ "$EXIT_CODE" -ne 0 ]; then
129128
upload_logs
130-
else
131-
upload_logs &
132129
fi
133130

134131
exit "$EXIT_CODE"

vhdbuilder/packer/pre-install-dependencies.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,10 @@ if isFlatcar "$OS" || isACL "$OS" "$OS_VARIANT"; then
6666
cp /etc/waagent.conf{,.new}
6767
mv /etc/waagent.conf{.new,}
6868
fi
69-
# enable AKS log collector
69+
# disable AKS log collector and waagent collection
7070
echo -e "\n# Disable WALA log collection because AKS Log Collector is installed.\nLogs.Collect=n" >> /etc/waagent.conf || exit 1
71-
systemctlEnableAndStart aks-log-collector.timer 30 || exit 1
71+
systemctl disable --now aks-log-collector.service || exit 1
72+
systemctl disable --now aks-log-collector.timer || exit 1
7273

7374
# enable the modified logrotate service and remove the auto-generated default logrotate cron job if present
7475
systemctlEnableAndStart logrotate.timer 30 || exit 1

0 commit comments

Comments
 (0)