Skip to content

Commit c9ce3e0

Browse files
authored
Fix OOM aborts in large-memory ASAN tests on GitHub runners (valkey-io#3263)
Carries on from where valkey-io#3161 left off. The test-sanitizer-address-large-memory jobs were being OOM-killed on GitHub-hosted runners (15.6GB RAM) due to ASAN's 2-3x memory overhead.

Changes:
- Skip the 4GB quicklist compression test under ASAN (requires ~16-24GB with dual buffers + ASAN overhead)
- Reduce integration test sizes from 5GB to 4.3GB (preserves >4GiB 32-bit boundary coverage)
- Reduce XADD iterations from 10 to 3
- Add memory monitoring to track minimum free memory during CI runs

Signed-off-by: Rain Valentine <rsg000@gmail.com>
1 parent 5133023 commit c9ce3e0

File tree

5 files changed

+73
-11
lines changed

5 files changed

+73
-11
lines changed

.github/workflows/daily.yml

Lines changed: 52 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -965,6 +965,9 @@ jobs:
965965
- name: unittest
966966
if: true && !contains(github.event.inputs.skiptests, 'unittest')
967967
run: ./src/unit/valkey-unit-gtests
968+
# Large-memory tests with sanitizers require 10-14GB RAM due to ASAN/UBSAN overhead.
969+
# GitHub-hosted runners for public repos provide 16GB (ubuntu-latest).
970+
# These tests are borderline - monitoring memory usage to determine if they can run reliably.
968971
test-sanitizer-address-large-memory:
969972
runs-on: ubuntu-latest
970973
if: |
@@ -990,6 +993,8 @@ jobs:
990993
echo "skiptests: ${{github.event.inputs.skiptests}}"
991994
echo "test_args: ${{github.event.inputs.test_args}}"
992995
echo "cluster_test_args: ${{github.event.inputs.cluster_test_args}}"
996+
- name: Log runner memory
997+
run: free -h
993998
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
994999
with:
9951000
repository: ${{ ((github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call') && (inputs.use_repo || github.event.inputs.use_repo)) || github.repository }}
@@ -1002,15 +1007,34 @@ jobs:
10021007
run: |
10031008
sudo apt-get update
10041009
sudo apt-get install tcl8.6 tclx -y
1010+
- name: Start memory monitor
1011+
run: |
1012+
# Track minimum free memory to detect OOM risk
1013+
(while true; do
1014+
FREE=$(awk '/MemAvailable/ {print $2}' /proc/meminfo)
1015+
echo "$FREE" >> /tmp/memfree.log
1016+
sleep 5
1017+
done) &
1018+
echo $! > /tmp/memmon.pid
10051019
- name: unittest
10061020
if: true && !contains(github.event.inputs.skiptests, 'unittest')
10071021
run: ./src/unit/valkey-unit-gtests --large-memory
10081022
- name: large memory tests
10091023
if: true && !contains(github.event.inputs.skiptests, 'valkey')
1010-
run: ./runtest --accurate --verbose --dump-logs --clients 5 --large-memory --tags large-memory ${{github.event.inputs.test_args}}
1024+
run: ./runtest --accurate --verbose --dump-logs --clients 1 --large-memory --tags large-memory ${{github.event.inputs.test_args}}
10111025
- name: large memory module api tests
10121026
if: true && !contains(github.event.inputs.skiptests, 'modules')
1013-
run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs --clients 5 --large-memory --tags large-memory ${{github.event.inputs.test_args}}
1027+
run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs --clients 1 --large-memory --tags large-memory ${{github.event.inputs.test_args}}
1028+
- name: Memory usage summary
1029+
if: always()
1030+
run: |
1031+
kill $(cat /tmp/memmon.pid) 2>/dev/null || true
1032+
echo "=== Memory Summary ==="
1033+
printf "Total RAM: %.1fGB\n" $(awk '/MemTotal/ {print $2/1024/1024}' /proc/meminfo)
1034+
if [ -f /tmp/memfree.log ]; then
1035+
MIN_FREE=$(sort -n /tmp/memfree.log | head -1)
1036+
printf "Minimum free memory: %.1fGB\n" $(echo "$MIN_FREE/1024/1024" | bc -l)
1037+
fi
10141038
test-sanitizer-undefined:
10151039
runs-on: ubuntu-latest
10161040
if: |
@@ -1069,6 +1093,9 @@ jobs:
10691093
- name: unittest
10701094
if: true && !contains(github.event.inputs.skiptests, 'unittest')
10711095
run: ./src/unit/valkey-unit-gtests --accurate
1096+
# Large-memory tests with sanitizers require 10-14GB RAM due to ASAN/UBSAN overhead.
1097+
# GitHub-hosted runners for public repos provide 16GB (ubuntu-latest).
1098+
# These tests are borderline - monitoring memory usage to determine if they can run reliably.
10721099
test-sanitizer-undefined-large-memory:
10731100
runs-on: ubuntu-latest
10741101
if: |
@@ -1094,6 +1121,8 @@ jobs:
10941121
echo "skiptests: ${{github.event.inputs.skiptests}}"
10951122
echo "test_args: ${{github.event.inputs.test_args}}"
10961123
echo "cluster_test_args: ${{github.event.inputs.cluster_test_args}}"
1124+
- name: Log runner memory
1125+
run: free -h
10971126
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
10981127
with:
10991128
repository: ${{ ((github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call') && (inputs.use_repo || github.event.inputs.use_repo)) || github.repository }}
@@ -1106,15 +1135,34 @@ jobs:
11061135
run: |
11071136
sudo apt-get update
11081137
sudo apt-get install tcl8.6 tclx -y
1138+
- name: Start memory monitor
1139+
run: |
1140+
# Track minimum free memory to detect OOM risk
1141+
(while true; do
1142+
FREE=$(awk '/MemAvailable/ {print $2}' /proc/meminfo)
1143+
echo "$FREE" >> /tmp/memfree.log
1144+
sleep 5
1145+
done) &
1146+
echo $! > /tmp/memmon.pid
11091147
- name: unittest
11101148
if: true && !contains(github.event.inputs.skiptests, 'unittest')
11111149
run: ./src/unit/valkey-unit-gtests --accurate --large-memory
11121150
- name: large memory tests
11131151
if: true && !contains(github.event.inputs.skiptests, 'valkey')
1114-
run: ./runtest --accurate --verbose --dump-logs --clients 5 --large-memory --tags large-memory ${{github.event.inputs.test_args}}
1152+
run: ./runtest --accurate --verbose --dump-logs --clients 1 --large-memory --tags large-memory ${{github.event.inputs.test_args}}
11151153
- name: large memory module api tests
11161154
if: true && !contains(github.event.inputs.skiptests, 'modules')
1117-
run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs --clients 5 --large-memory --tags large-memory ${{github.event.inputs.test_args}}
1155+
run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs --clients 1 --large-memory --tags large-memory ${{github.event.inputs.test_args}}
1156+
- name: Memory usage summary
1157+
if: always()
1158+
run: |
1159+
kill $(cat /tmp/memmon.pid) 2>/dev/null || true
1160+
echo "=== Memory Summary ==="
1161+
printf "Total RAM: %.1fGB\n" $(awk '/MemTotal/ {print $2/1024/1024}' /proc/meminfo)
1162+
if [ -f /tmp/memfree.log ]; then
1163+
MIN_FREE=$(sort -n /tmp/memfree.log | head -1)
1164+
printf "Minimum free memory: %.1fGB\n" $(echo "$MIN_FREE/1024/1024" | bc -l)
1165+
fi
11181166
test-sanitizer-force-defrag:
11191167
runs-on: ubuntu-latest
11201168
if: |

src/unit/test_quicklist.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1765,6 +1765,13 @@ TEST_F(QuicklistTest, quicklistCompressAndDecompressQuicklistListpackNode) {
17651765
TEST_F(QuicklistTest, quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX) {
17661766
if (!large_memory) GTEST_SKIP() << "Skipping large memory test";
17671767

1768+
#ifdef VALKEY_ADDRESS_SANITIZER
1769+
/* Skip under ASAN: compression requires both original (4GB) and output
1770+
* buffer (~4GB) simultaneously, totaling ~8GB. With ASAN's 2-3x memory
1771+
* overhead, peak usage reaches ~16-24GB, exceeding GitHub runner limits. */
1772+
GTEST_SKIP() << "Skipping large memory test under address sanitizer";
1773+
#endif
1774+
17681775
#if ULONG_MAX >= 0xffffffffffffffff
17691776

17701777
size_t sz = (1ull << 32);

tests/unit/type/list.tcl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,9 @@ catch {
291291
}
292292
if {[lindex [r config get proto-max-bulk-len] 1] == 10000000000} {
293293

294-
set str_length 5000000000
294+
# Reduced from 5GB to fit in 16GB CI runners with ASAN overhead
295+
# Must exceed 2^32 (4294967296) to test >4GiB (32-bit boundary) behavior
296+
set str_length 4300000000
295297

296298
# repeating all the plain nodes basic checks with 5gb values
297299
test {Test LPUSH and LPOP on plain nodes over 4GB} {

tests/unit/type/set.tcl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1178,7 +1178,9 @@ catch {
11781178
}
11791179
if {[lindex [r config get proto-max-bulk-len] 1] == 10000000000} {
11801180

1181-
set str_length 4400000000 ;#~4.4GB
1181+
# Reduced from 4.4GB to fit in 16GB CI runners with ASAN overhead
1182+
# Must exceed 2^32 (4294967296) to test >4GiB (32-bit boundary) behavior
1183+
set str_length 4300000000 ;#~4GiB, >2^32
11821184

11831185
test {SADD, SCARD, SISMEMBER - large data} {
11841186
r flushdb

tests/unit/violations.tcl

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
# One XADD with one huge 5GB field
1+
# One XADD with one huge >4GiB field (reduced from 5GB for CI memory limits)
2+
# Must exceed 2^32 to require more than 32 bits to address
23
# Expected to fail resulting in an empty stream
34
run_solo {violations} {
45
start_server [list overrides [list save ""] tags {"large-memory"}] {
@@ -8,7 +9,7 @@ start_server [list overrides [list save ""] tags {"large-memory"}] {
89
r write "*5\r\n\$4\r\nXADD\r\n\$2\r\nS1\r\n\$1\r\n*\r\n"
910
r write "\$1\r\nA\r\n"
1011
catch {
11-
write_big_bulk 5000000000 ;#5gb
12+
write_big_bulk 4300000000 ;#~4GiB, >2^32
1213
} err
1314
assert_match {*too large*} $err
1415
r xlen S1
@@ -33,15 +34,16 @@ start_server [list overrides [list save ""] tags {"large-memory"}] {
3334
}
3435

3536
# Gradually add big stream fields using repeated XADD calls
37+
# Reduced from 10 to 3 iterations to fit in 16GB CI runners with ASAN overhead
3638
start_server [list overrides [list save ""] tags {"large-memory"}] {
3739
test {several XADD big fields} {
3840
r config set stream-node-max-bytes 0
39-
for {set j 0} {$j<10} {incr j} {
41+
for {set j 0} {$j<3} {incr j} {
4042
r xadd stream * 1 $::str500 2 $::str500
4143
}
4244
r ping
4345
r xlen stream
44-
} {10}
46+
} {3}
4547
}
4648

4749
# Add over 4GB to a single stream listpack (one XADD command)
@@ -75,14 +77,15 @@ start_server [list overrides [list save ""] tags {"large-memory"}] {
7577

7678
# Add over 4GB to a single hash field (one HSET command)
7779
# Object will be converted to hashtable encoding
80+
# Reduced from 5GB; must exceed 2^32 to test >4GiB (32-bit boundary) behavior
7881
start_server [list overrides [list save ""] tags {"large-memory"}] {
7982
test {hash with one huge field} {
8083
catch {r config set hash-max-ziplist-value 10000000000} ;#10gb
8184
r config set proto-max-bulk-len 10000000000 ;#10gb
8285
r config set client-query-buffer-limit 10000000000 ;#10gb
8386
r write "*4\r\n\$4\r\nHSET\r\n\$2\r\nH1\r\n"
8487
r write "\$1\r\nA\r\n"
85-
write_big_bulk 5000000000 ;#5gb
88+
write_big_bulk 4300000000 ;#~4GiB, >2^32
8689
r object encoding H1
8790
} {hashtable}
8891
}

Comments: 0