Skip to content

Commit 130ab68

Browse files
committed
e2e: relax topology-aware coldstart test
Increase the amount of memory allocated by the test container, in an attempt to reduce test flakiness by reducing the relative effect of "memory usage noise" caused by other system processes. Make the error threshold (acceptable margin of error) of coldstart memory allocation proportional, being 10 percent of the allocated size. Also increase the wait duration of the coldstart test to avoid flakiness on slow/heavily loaded systems. (cherry picked from commit 0383c38)
1 parent 3c4c6c5 commit 130ab68

File tree

4 files changed

+29
-28
lines changed

4 files changed

+29
-28
lines changed

test/e2e/policies.test-suite/topology-aware/c4pmem4/test03-coldstart-deprecated-syntax/bb-coldstart.yaml.in

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ metadata:
77
${NAME}c0: dram,pmem
88
cri-resource-manager.intel.com/cold-start: |
99
${NAME}c0:
10-
duration: ${DURATION}
10+
duration: ${DURATION_S}s
1111
spec:
1212
containers:
1313
- name: ${NAME}c0
@@ -16,9 +16,9 @@ spec:
1616
command:
1717
- sh
1818
- -c
19-
- 'cold_alloc=\$(dd if=/dev/zero bs=${COLD_ALLOC} count=1 | tr \"\\\0\" \"x\");
19+
- 'cold_alloc=\$(dd if=/dev/zero bs=${COLD_ALLOC_KB}kB count=1 | tr \"\\\0\" \"x\");
2020
sh -c \"paused after cold_alloc \\\$(sleep inf)\";
21-
warm_alloc=\$(dd if=/dev/zero bs=${WARM_ALLOC} count=1 | tr \"\\\0\" \"x\");
21+
warm_alloc=\$(dd if=/dev/zero bs=${WARM_ALLOC_KB}kB count=1 | tr \"\\\0\" \"x\");
2222
sh -c \"paused after warm_alloc \\\$(sleep inf)\";
2323
echo ${NAME}c0 \$(sleep inf); # needed for pod resource discovery'
2424
resources:

test/e2e/policies.test-suite/topology-aware/c4pmem4/test03-coldstart-deprecated-syntax/code.var.sh

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
# Test that a cold-started pod...
22
# 1. is allowed to allocate memory only from PMEM nodes
3-
# during cold period (of length $DURATION).
3+
# during cold period (of length $DURATION_S).
44
# 2. is restricted from the very beginning of pod execution:
55
# immediately allocated memory blob consumes PMEM from expected node.
66
# 3. is allowed to allocate memory from both PMEM and DRAM after
77
# the cold period.
8-
# 4. is no more restricted after $DURATION + 1s has passed in pod:
8+
# 4. is no more restricted after $DURATION_S + 1s has passed in pod:
99
# warm-allocated memory is not taken from PMEM nodes.
1010

1111
PMEM_NODES='{"node4", "node5", "node6", "node7"}'
@@ -22,9 +22,9 @@ CRI_RESMGR_OUTPUT="cat cri-resmgr.output.txt"
2222

2323
PMEM_USED_BEFORE_POD0="$(pmem-used)"
2424

25-
DURATION=10s
26-
COLD_ALLOC=$((10 * 1024))kB
27-
WARM_ALLOC=$((20 * 1024))kB
25+
DURATION_S=10
26+
COLD_ALLOC_KB=$((50 * 1024))
27+
WARM_ALLOC_KB=$((100 * 1024))
2828
MEM=1G
2929
create bb-coldstart
3030

@@ -40,21 +40,21 @@ vm-run-until "pgrep -f '^sh -c paused after cold_alloc'" >/dev/null ||
4040
error "cold memory allocation timed out"
4141

4242
echo "Verify PMEM consumption during cold period."
43-
PMEM_ERROR_MARGIN=1024 # meminfo MemUsed vs dd bytes error margin
43+
# meminfo MemUsed vs dd bytes error margin, use 10%
44+
PMEM_ERROR_MARGIN=$((COLD_ALLOC_KB / 10))
4445
sleep 1
4546
PMEM_USED_COLD_POD0="$(pmem-used)"
4647
PMEM_COLD_CONSUMED=$(( $PMEM_USED_COLD_POD0 - $PMEM_USED_BEFORE_POD0 ))
47-
if (( $PMEM_COLD_CONSUMED + $PMEM_ERROR_MARGIN < ${COLD_ALLOC%kB} )); then
48-
error "pod0 did not allocate $COLD_ALLOC from PMEM. MemUsed PMEM delta: $PMEM_COLD_CONSUMED"
48+
if (( $PMEM_COLD_CONSUMED + $PMEM_ERROR_MARGIN < $COLD_ALLOC_KB )); then
49+
error "pod0 did not allocate ${COLD_ALLOC_KB}kB from PMEM. MemUsed PMEM delta: $PMEM_COLD_CONSUMED"
4950
else
50-
echo "### Verified: PMEM memory consumed during cold period: $PMEM_COLD_CONSUMED kB, pod script allocated: ${COLD_ALLOC%kB} kB"
51+
echo "### Verified: PMEM memory consumed during cold period: $PMEM_COLD_CONSUMED kB, pod script allocated: $COLD_ALLOC_KB kB"
5152
fi
5253

5354
coldstarts=$(vm-command-q "$CRI_RESMGR_OUTPUT | grep 'finishing coldstart period for pod0:pod0c0' | wc -l")
54-
echo "Wait that cri-resmgr finishes coldstart period within 5s + $DURATION."
55-
sleep 5s
56-
vm-run-until --timeout ${DURATION%s} "[ \$($CRI_RESMGR_OUTPUT | grep 'finishing coldstart period for pod0:pod0c0' | wc -l) -gt $coldstarts ]" ||
57-
error "cri-resmgr did not report finishing coldstart period within $DURATION"
55+
echo "Wait that cri-resmgr finishes coldstart period within $(($DURATION_S + 10)) seconds."
56+
vm-run-until --timeout $((DURATION_S + 10)) "[ \$($CRI_RESMGR_OUTPUT | grep 'finishing coldstart period for pod0:pod0c0' | wc -l) -gt $coldstarts ]" ||
57+
error "cri-resmgr did not report finishing coldstart period within $DURATION_S seconds"
5858

5959
vm-command "$CRI_RESMGR_OUTPUT | grep 'pinning to memory 1,7'" ||
6060
error "cri-resmgr did not report pinning to expected memory nodes"
@@ -74,7 +74,7 @@ sleep 1
7474
PMEM_USED_WARM_POD0="$(pmem-used)"
7575
PMEM_WARM_CONSUMED=$(( $PMEM_USED_WARM_POD0 - $PMEM_USED_COLD_POD0 ))
7676
if (( $PMEM_WARM_CONSUMED > 0 )); then
77-
echo "### Verify (soft) failed: pod0 allocated $WARM_ALLOC from PMEM. Should have been taken from DRAM."
77+
echo "### Verify (soft) failed: pod0 allocated $WARM_ALLOC_KB kB from PMEM. Should have been taken from DRAM."
7878
else
79-
echo "### Verified (soft): PMEM memory consumption delta during warm period: $PMEM_WARM_CONSUMED kB, pod script allocated: ${WARM_ALLOC%kB} kB"
79+
echo "### Verified (soft): PMEM memory consumption delta during warm period: $PMEM_WARM_CONSUMED kB, pod script allocated: $WARM_ALLOC_KB kB"
8080
fi

test/e2e/policies.test-suite/topology-aware/c4pmem4/test03-coldstart/bb-coldstart.yaml.in

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ spec:
1414
command:
1515
- sh
1616
- -c
17-
- 'cold_alloc=\$(dd if=/dev/zero bs=${COLD_ALLOC} count=1 | tr \"\\\0\" \"x\");
17+
- 'cold_alloc=\$(dd if=/dev/zero bs=${COLD_ALLOC_KB}kB count=1 | tr \"\\\0\" \"x\");
1818
sh -c \"paused after cold_alloc \\\$(sleep inf)\";
19-
warm_alloc=\$(dd if=/dev/zero bs=${WARM_ALLOC} count=1 | tr \"\\\0\" \"x\");
19+
warm_alloc=\$(dd if=/dev/zero bs=${WARM_ALLOC_KB}kB count=1 | tr \"\\\0\" \"x\");
2020
sh -c \"paused after warm_alloc \\\$(sleep inf)\";
2121
echo ${NAME}c0 \$(sleep inf); # needed for pod resource discovery'
2222
resources:

test/e2e/policies.test-suite/topology-aware/c4pmem4/test03-coldstart/code.var.sh

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ CRI_RESMGR_OUTPUT="cat cri-resmgr.output.txt"
2323
PMEM_USED_BEFORE_POD0="$(pmem-used)"
2424

2525
DURATION=10s
26-
COLD_ALLOC=$((10 * 1024))kB
27-
WARM_ALLOC=$((20 * 1024))kB
26+
COLD_ALLOC_KB=$((50 * 1024))
27+
WARM_ALLOC_KB=$((100 * 1024))
2828
MEM=1G
2929
create bb-coldstart
3030

@@ -40,14 +40,15 @@ vm-run-until "pgrep -f '^sh -c paused after cold_alloc'" >/dev/null ||
4040
error "cold memory allocation timed out"
4141

4242
echo "Verify PMEM consumption during cold period."
43-
PMEM_ERROR_MARGIN=1024 # meminfo MemUsed vs dd bytes error margin
43+
# meminfo MemUsed vs dd bytes error margin, use 10%
44+
PMEM_ERROR_MARGIN=$((COLD_ALLOC_KB / 10))
4445
sleep 1
4546
PMEM_USED_COLD_POD0="$(pmem-used)"
4647
PMEM_COLD_CONSUMED=$(( $PMEM_USED_COLD_POD0 - $PMEM_USED_BEFORE_POD0 ))
47-
if (( $PMEM_COLD_CONSUMED + $PMEM_ERROR_MARGIN < ${COLD_ALLOC%kB} )); then
48-
error "pod0 did not allocate $COLD_ALLOC from PMEM. MemUsed PMEM delta: $PMEM_COLD_CONSUMED"
48+
if (( $PMEM_COLD_CONSUMED + $PMEM_ERROR_MARGIN < $COLD_ALLOC_KB )); then
49+
error "pod0 did not allocate ${COLD_ALLOC_KB}kB from PMEM. MemUsed PMEM delta: $PMEM_COLD_CONSUMED"
4950
else
50-
echo "### Verified: PMEM memory consumed during cold period: $PMEM_COLD_CONSUMED kB, pod script allocated: ${COLD_ALLOC%kB} kB"
51+
echo "### Verified: PMEM memory consumed during cold period: $PMEM_COLD_CONSUMED kB, pod script allocated: $COLD_ALLOC_KB kB"
5152
fi
5253

5354
coldstarts=$(vm-command-q "$CRI_RESMGR_OUTPUT | grep 'finishing coldstart period for pod0:pod0c0' | wc -l")
@@ -74,7 +75,7 @@ sleep 1
7475
PMEM_USED_WARM_POD0="$(pmem-used)"
7576
PMEM_WARM_CONSUMED=$(( $PMEM_USED_WARM_POD0 - $PMEM_USED_COLD_POD0 ))
7677
if (( $PMEM_WARM_CONSUMED > 0 )); then
77-
echo "### Verify (soft) failed: pod0 allocated $WARM_ALLOC from PMEM. Should have been taken from DRAM."
78+
echo "### Verify (soft) failed: pod0 allocated $WARM_ALLOC_KB kB from PMEM. Should have been taken from DRAM."
7879
else
79-
echo "### Verified (soft): PMEM memory consumption delta during warm period: $PMEM_WARM_CONSUMED kB, pod script allocated: ${WARM_ALLOC%kB} kB"
80+
echo "### Verified (soft): PMEM memory consumption delta during warm period: $PMEM_WARM_CONSUMED kB, pod script allocated: $WARM_ALLOC_KB kB"
8081
fi

0 commit comments

Comments
 (0)