Skip to content

Commit c3a89b5

Browse files
authored
Address possible issue with skewed timestamps in system log after (re)boot with power loss (#9)
* Add tests for log with skewed timestamps * fix: introduce workaround for aligning timestamps in boot log See #5 for details. * Extend tests, add comments * loki_exporter: use mktemp instead of static dir * loki_exporter: put curl into verbose mode for tests * loki_exporter: clean up curl options * loki_exporter: adjust curl headers once again In particular: 1. Tests do not require authorization, so it has been turned off. 2. Added "Connection: close" header to all curl invocations to avoid unnecessary log warnings in the interim proxies like nginx if HTTP keepalives are enabled.
1 parent 686e896 commit c3a89b5

File tree

6 files changed

+260
-33
lines changed

6 files changed

+260
-33
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,7 @@ __pycache__
66
*.pyo
77
create-test-env
88
run-test-exporter-onetime
9+
run-test-exporter-timeshifted-onetime
910
tests/*.log.state
11+
tests/default-timeshifted.log
1012
loki-exporter/

Makefile

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,9 @@ run-test-exporter-boot: test-env ## Run exporter with mocking logread (BOOT=1)
142142
LOKI_AUTH_HEADER="none" \
143143
/bin/bash -u loki_exporter.sh
144144

145+
tests/default-timeshifted.log: tests/default.log
146+
$(TOPDIR)/tests/create_timeshifted_log.py >$@
147+
145148
run-test-exporter-onetime: create-test-env ## Run one-time cycle of mocking logread + exporter (BOOT=1)
146149
LOGREAD="./tests/logread.py" \
147150
BOOT=1 \
@@ -153,17 +156,31 @@ run-test-exporter-onetime: create-test-env ## Run one-time cycle of mocking logr
153156
/bin/bash -u loki_exporter.sh
154157
touch $@
155158

159+
run-test-exporter-timeshifted-onetime: tests/default-timeshifted.log create-test-env ## Run one-time cycle of mocking logread + exporter (BOOT=1 with time unsync emulation)
160+
LOGREAD="./tests/logread.py --log-file tests/default-timeshifted.log" \
161+
BOOT=1 \
162+
HOSTNAME="$(shell hostname).timeshifted" \
163+
LOKI_PUSH_URL="http://127.0.0.1:3100/loki/api/v1/push" \
164+
LOKI_AUTH_HEADER="none" \
165+
MAX_FOLLOW_CYCLES=3 \
166+
AUTOTEST=1 \
167+
/bin/bash -u loki_exporter.sh
168+
touch $@
169+
156170
.PHONY: test
157-
test: run-test-exporter-onetime | .venv results ## Run tests
171+
test: run-test-exporter-onetime run-test-exporter-timeshifted-onetime | .venv results ## Run tests
158172
$(TOPDIR)/.venv/bin/python3 -m pytest
159173

160174
.PHONY: compare-logs
161175
compare-logs: | results
162176
cat tests/default.log | cut -c 43- | sort >results/messages.original
163177
cat results/resurrected.log | cut -c 17- | sort >results/messages.resurrected
164-
wc -l results/messages.original results/messages.resurrected
165-
diff -u results/messages.original results/messages.resurrected ||:
166-
test $$(diff -u results/messages.original results/messages.resurrected | grep -- " (MOCK)$$" | wc -l) -eq 3
178+
cat results/resurrected-timeshifted.log | cut -c 17- | sort >results/messages.resurrected-timeshifted
179+
for target in messages.resurrected messages.resurrected-timeshifted; do \
180+
wc -l results/messages.original results/$${target} ; \
181+
diff -u results/messages.original results/$${target} ||: ; \
182+
test $$(diff -u results/messages.original results/$${target} | grep -- " (MOCK)$$" | wc -l) -eq 3 ; \
183+
done
167184

168185
.PHONY: save-logs
169186
save-logs: | results
@@ -261,7 +278,8 @@ clean: delete-test-env ## Clean-up
261278
find $(TOPDIR)/ -type f -name "*.pyo" -delete
262279
find $(TOPDIR)/ -type d -name "__pycache__" -delete
263280
rm -rf $(TOPDIR)/.pytest_cache
264-
rm -f $(TOPDIR)/run-test-exporter-onetime
281+
rm -f $(TOPDIR)/run-test-exporter-onetime $(TOPDIR)/run-test-exporter-timeshifted-onetime
265282
find $(TOPDIR)/tests/ -type f -name "*.log.state" -delete
283+
rm -f $(TOPDIR)/tests/default-timeshifted.log
266284
rm -rf $(TOPDIR)/results
267285
rm -rf $(TOPDIR)/loki-exporter

loki_exporter.sh

Lines changed: 127 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,28 @@
44
# ^^^ the above line is purely for shellcheck to treat this as a bash-like script
55
# (OpenWRT's ash from busybox is kinda similar but there still could be issues)
66

7-
PIPE_NAME="/tmp/loki_exporter.pipe"
8-
BULK_DATA="/tmp/loki_exporter.boot"
7+
_TMPDIR=$(mktemp -d -p /tmp loki_exporter.XXXXXX)
8+
PIPE_NAME="/${_TMPDIR}/loki_exporter.pipe"
9+
BULK_DATA="/${_TMPDIR}/loki_exporter.boot"
910

1011
LOKI_MSG_TEMPLATE="{\"streams\": [{\"stream\": {\"job\": \"openwrt_loki_exporter\", \"host\": \"${HOSTNAME}\"}, \"values\": [[\"TIMESTAMP\", \"MESSAGE\"]]}]}"
1112
LOKI_BULK_TEMPLATE_HEADER="{\"streams\": [{\"stream\": {\"job\": \"openwrt_loki_exporter\", \"host\": \"${HOSTNAME}\"}, \"values\": ["
1213
LOKI_BULK_TEMPLATE_MSG="[\"TIMESTAMP\", \"MESSAGE\"],"
1314
LOKI_BULK_TEMPLATE_FOOTER="]}]}"
1415

16+
DATETIME_STR_FORMAT="%a %b %d %H:%M:%S %Y"
17+
OS=$(uname -s | tr "[:upper:]" "[:lower:]")
18+
19+
if [ "${AUTOTEST-0}" -eq 1 ]; then
20+
_CURL_BULK_CMD=(curl --no-progress-meter -fv -H "Content-Type: application/json" -H "Content-Encoding: gzip" -H "Connection: close")
21+
_CURL_CMD=(curl --no-progress-meter -fv -H "Content-Type: application/json" -H "Connection: close")
22+
else
23+
_CURL_BULK_CMD=(curl -fsS -H "Content-Type: application/json" -H "Content-Encoding: gzip" -H "Authorization: Basic ${LOKI_AUTH_HEADER}" -H "Connection: close")
24+
_CURL_CMD=(curl -fsS -H "Content-Type: application/json" -H "Authorization: Basic ${LOKI_AUTH_HEADER}" -H "Connection: close")
25+
fi
26+
1527
_setup() {
16-
mkfifo ${PIPE_NAME}
28+
mkfifo "${PIPE_NAME}"
1729
echo "started with BOOT=${BOOT}" >&2
1830
}
1931

@@ -23,11 +35,17 @@ _teardown() {
2335
else
2436
echo "tailer ${tailer_pid} killed, exiting" >&2
2537
fi
26-
rm -f ${PIPE_NAME}
38+
rm -f "${PIPE_NAME}"
39+
if [ "${AUTOTEST-0}" -eq 1 ]; then
40+
mkdir -p results
41+
cp -r "${_TMPDIR}" results/
42+
fi
43+
rm -rf "${_TMPDIR}"
2744
exit 0
2845
}
2946

3047
_do_bulk_post() {
48+
_log_file="$1"
3149
post_body="${LOKI_BULK_TEMPLATE_HEADER}"
3250

3351
while read -r line; do
@@ -49,21 +67,109 @@ _do_bulk_post() {
4967
msg_payload="${msg_payload/MESSAGE/$msg}"
5068

5169
post_body="${post_body}${msg_payload}"
52-
done <${BULK_DATA}
70+
done <"${_log_file}"
5371

5472
post_body="${post_body:0:${#post_body}-1}${LOKI_BULK_TEMPLATE_FOOTER}"
55-
echo "${post_body}" | gzip >${BULK_DATA}.payload.gz
56-
rm -f ${BULK_DATA}
73+
echo "${post_body}" | gzip >"${_log_file}.payload.gz"
74+
rm -f "${_log_file}"
5775

58-
if curl -fs -X POST -H "Content-Type: application/json" -H "Content-Encoding: gzip" -H "Authorization: Basic ${LOKI_AUTH_HEADER}" --data-binary "@${BULK_DATA}.payload.gz" "${LOKI_PUSH_URL}"; then
59-
if [ "${AUTOTEST-0}" -eq 1 ]; then
60-
mkdir -p results
61-
cp ${BULK_DATA}.payload.gz results/
76+
if ! "${_CURL_BULK_CMD[@]}" --data-binary "@${_log_file}.payload.gz" "${LOKI_PUSH_URL}" >"${_log_file}.payload.gz-response" 2>&1; then
77+
echo "BULK POST FAILED: leaving ${_log_file}.payload.gz for now"
78+
fi
79+
}
80+
81+
_check_for_skewed_timestamp() {
82+
_log_file="$1"
83+
84+
# maximum threshold for comparing timestamps between 2 subsequent log lines (ns)
85+
delta_threshold=86400000000000
86+
87+
# incremental step for substituting timestamps of unsynchronized log lines (ns)
88+
step=25000000
89+
90+
# step 1: search for possible skewed timestamp
91+
prev_ts=0
92+
line_n=0
93+
line_n_synced=0
94+
while read -r line; do
95+
ts="${line:26:14}"
96+
ts_ms="${ts/./}"
97+
98+
# shellcheck disable=SC2116
99+
# subshell is required to handle multiplication errors and keep the loop
100+
if ! ts_ns="$(echo $(( ts_ms * 1000 * 1000 )) )" ; then
101+
continue
62102
fi
63-
rm -f ${BULK_DATA}.payload.gz
64-
else
65-
echo "BULK POST FAILED: leaving ${BULK_DATA}.payload.gz for now"
103+
104+
line_n=$((line_n + 1))
105+
106+
if [ "${prev_ts-0}" -eq 0 ]; then
107+
prev_ts=$ts_ns
108+
fi
109+
110+
delta_t=$((ts_ns - prev_ts))
111+
if [ $delta_t -ge $delta_threshold ]; then
112+
# found a line with timestamp delta exceeding a given threshold
113+
line_n_synced=$line_n
114+
break
115+
fi
116+
117+
prev_ts=$ts_ns
118+
done <"${_log_file}"
119+
120+
if [ $line_n_synced -eq 0 ]; then
121+
# no skew detected, nothing to do
122+
return
66123
fi
124+
125+
# skew detected, 1st synced line is $line_n_synced;
126+
# round new ts to nearest second
127+
ts_ns=$((ts_ns / 1000000000))
128+
ts_ns=$((ts_ns * 1000000000))
129+
new_ts=$((ts_ns - step * (line_n_synced-1)))
130+
131+
# step 2: re-create boot log with fake timestamps in appropriate range
132+
rm -f "${_log_file}.new"
133+
line_n=0
134+
while read -r line; do
135+
ts="${line:26:14}"
136+
ts_ms="${ts/./}"
137+
138+
# shellcheck disable=SC2116
139+
# subshell is required to handle multiplication errors and keep the loop
140+
if ! ts_ns="$(echo $(( ts_ms * 1000 * 1000 )) )" ; then
141+
continue
142+
fi
143+
144+
line_n=$((line_n + 1))
145+
146+
# for lines with valid timestamps, just print a line as is
147+
if [ $line_n -ge $line_n_synced ]; then
148+
printf "%s\n" "$line" >>"$1.new"
149+
continue
150+
fi
151+
152+
# otherwise, craft a new line
153+
msg="${line:42:2000}"
154+
155+
new_ts_s=$((new_ts / 1000000000))
156+
case "${OS}" in
157+
darwin)
158+
datetime_str=$(date -r "${new_ts_s}" +"${DATETIME_STR_FORMAT}")
159+
;;
160+
*)
161+
datetime_str=$(date -d @"${new_ts_s}" +"${DATETIME_STR_FORMAT}")
162+
;;
163+
esac
164+
165+
new_ts_ms_rounded=$((new_ts / 1000000))
166+
printf "%s [%s] %s\n" "${datetime_str}" "${new_ts_ms_rounded:0:10}.${new_ts_ms_rounded:10:13}" "${msg}" >>"${_log_file}.new"
167+
168+
# increase timestamp
169+
new_ts=$((new_ts + step))
170+
done <"${_log_file}"
171+
172+
mv "${_log_file}.new" "${_log_file}"
67173
}
68174

69175
_main_loop() {
@@ -78,7 +184,7 @@ _main_loop() {
78184
fi
79185
fi
80186

81-
${TAILER_CMD} >${PIPE_NAME} 2>&1 &
187+
${TAILER_CMD} >"${PIPE_NAME}" 2>&1 &
82188
tailer_pid=$!
83189

84190
while read -r line; do
@@ -105,21 +211,21 @@ _main_loop() {
105211
post_body="${post_body/TIMESTAMP/$ts_ns}"
106212
post_body="${post_body/MESSAGE/$msg}"
107213

108-
if ! curl -fs -X POST -H "Content-type: application/json" -H "Authorization: Basic ${LOKI_AUTH_HEADER}" -d "${post_body}" "${LOKI_PUSH_URL}"; then
214+
if ! "${_CURL_CMD[@]}" -d "${post_body}" "${LOKI_PUSH_URL}"; then
109215
echo "POST FAILED: '${post_body}'"
110216
fi
111217

112218
MIN_TIMESTAMP="${ts_ns}"
113-
done <${PIPE_NAME}
219+
done <"${PIPE_NAME}"
114220
}
115221

116222
_setup
117223

118224
MIN_TIMESTAMP=0
119225

120226
if [ "${BOOT}" -eq 1 ]; then
121-
${LOGREAD} -t >${BULK_DATA}
122-
last_line="$(tail -1 ${BULK_DATA})"
227+
${LOGREAD} -t >"${BULK_DATA}"
228+
last_line="$(tail -1 "${BULK_DATA}")"
123229
ts="${last_line:26:14}"
124230
ts_ms="${ts/./}"
125231

@@ -131,7 +237,8 @@ if [ "${BOOT}" -eq 1 ]; then
131237
MIN_TIMESTAMP=${ts_ns}
132238
fi
133239

134-
_do_bulk_post
240+
_check_for_skewed_timestamp "${BULK_DATA}"
241+
_do_bulk_post "${BULK_DATA}"
135242
fi
136243

137244
trap "_teardown" SIGINT SIGTERM EXIT

tests/create_timeshifted_log.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Helper script to create log with timestamp skew emulation
4+
"""
5+
6+
import sys
7+
import random
8+
from logread import MockLogRead
9+
10+
11+
def main():
12+
"""
13+
Entrypoint
14+
"""
15+
mlr = MockLogRead()
16+
mlr.parse_args()
17+
mlr.load_lines()
18+
19+
num_lines = len(mlr.lines)
20+
sync_from_line = random.randrange(
21+
(num_lines // 3) * 2, (num_lines // 3) * 2 + (num_lines // 3) // 2
22+
)
23+
shift_back = (180 * 86400) + (random.randrange(1, 30) * 86400)
24+
25+
# print(f"loaded {num_lines} lines; making {sync_from_line} the 1st one with synced time")
26+
27+
for nl, line in enumerate(mlr.lines):
28+
if nl < sync_from_line:
29+
msg = mlr.get_msg_from_line(line)
30+
ts = mlr.get_ts_existing(line)
31+
ts -= shift_back
32+
tstr = mlr.datetime_str_from_ts(ts)
33+
print(f"{tstr} [{ts:.03f}] {msg}")
34+
continue
35+
print(line)
36+
37+
return 0
38+
39+
40+
if __name__ == "__main__":
41+
sys.exit(main())

tests/logread.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ def __init__(self):
2121
self.lines = []
2222
self.args = None
2323
self.ts_first_line = None
24+
self.ts_last_line = None
2425
self.ts_start_from = None
2526
self.ts_apply_delta = None
2627

@@ -142,6 +143,7 @@ def load_lines(self):
142143
self.lines = [line.rstrip() for line in file.readlines()]
143144

144145
self.ts_first_line = self.get_adjusted_ts(self.lines[0])
146+
self.ts_last_line = self.get_adjusted_ts(self.lines[-1])
145147
state_file = self.args.log_file + ".state"
146148

147149
if os.path.isfile(state_file):
@@ -152,7 +154,8 @@ def load_lines(self):
152154
with open(state_file, "w", encoding="utf-8") as file:
153155
file.write(f"{self.ts_start_from.timestamp()}\n")
154156

155-
self.ts_apply_delta = self.ts_start_from.timestamp() - self.ts_first_line
157+
# for timeshift tests we assume that last line always contains synchronized timestamp
158+
self.ts_apply_delta = self.ts_start_from.timestamp() - self.ts_last_line
156159

157160
def print_line(self, line: str):
158161
"""

0 commit comments

Comments
 (0)