Skip to content

Commit c9ce3e0

Browse files
authored
Fix OOM aborts in large-memory ASAN tests on GitHub runners (valkey-io#3263)
Carries on from where valkey-io#3161 left off. The test-sanitizer-address-large-memory jobs were being OOM-killed on GitHub-hosted runners (15.6GB RAM) due to ASAN's 2-3x memory overhead.

Changes:
- Skip the 4GB quicklist compression test under ASAN (requires ~16-24GB with dual buffers + ASAN overhead)
- Reduce integration test sizes from 5GB to 4.3GB (preserves >4GiB 32-bit boundary coverage)
- Reduce XADD iterations from 10 to 3
- Add memory monitoring to track minimum free memory during CI runs

Signed-off-by: Rain Valentine <rsg000@gmail.com>
1 parent 5133023 commit c9ce3e0

File tree

5 files changed

+73
-11
lines changed

5 files changed

+73
-11
lines changed

.github/workflows/daily.yml

Lines changed: 52 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -965,6 +965,9 @@ jobs:
965965
- name: unittest
966966
if: true && !contains(github.event.inputs.skiptests, 'unittest')
967967
run: ./src/unit/valkey-unit-gtests
968+
# Large-memory tests with sanitizers require 10-14GB RAM due to ASAN/UBSAN overhead.
969+
# GitHub-hosted runners for public repos provide 16GB (ubuntu-latest).
970+
# These tests are borderline - monitoring memory usage to determine if they can run reliably.
968971
test-sanitizer-address-large-memory:
969972
runs-on: ubuntu-latest
970973
if: |
@@ -990,6 +993,8 @@ jobs:
990993
echo "skiptests: ${{github.event.inputs.skiptests}}"
991994
echo "test_args: ${{github.event.inputs.test_args}}"
992995
echo "cluster_test_args: ${{github.event.inputs.cluster_test_args}}"
996+
- name: Log runner memory
997+
run: free -h
993998
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
994999
with:
9951000
repository: ${{ ((github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call') && (inputs.use_repo || github.event.inputs.use_repo)) || github.repository }}
@@ -1002,15 +1007,34 @@ jobs:
10021007
run: |
10031008
sudo apt-get update
10041009
sudo apt-get install tcl8.6 tclx -y
1010+
- name: Start memory monitor
1011+
run: |
1012+
# Track minimum free memory to detect OOM risk
1013+
(while true; do
1014+
FREE=$(awk '/MemAvailable/ {print $2}' /proc/meminfo)
1015+
echo "$FREE" >> /tmp/memfree.log
1016+
sleep 5
1017+
done) &
1018+
echo $! > /tmp/memmon.pid
10051019
- name: unittest
10061020
if: true && !contains(github.event.inputs.skiptests, 'unittest')
10071021
run: ./src/unit/valkey-unit-gtests --large-memory
10081022
- name: large memory tests
10091023
if: true && !contains(github.event.inputs.skiptests, 'valkey')
1010-
run: ./runtest --accurate --verbose --dump-logs --clients 5 --large-memory --tags large-memory ${{github.event.inputs.test_args}}
1024+
run: ./runtest --accurate --verbose --dump-logs --clients 1 --large-memory --tags large-memory ${{github.event.inputs.test_args}}
10111025
- name: large memory module api tests
10121026
if: true && !contains(github.event.inputs.skiptests, 'modules')
1013-
run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs --clients 5 --large-memory --tags large-memory ${{github.event.inputs.test_args}}
1027+
run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs --clients 1 --large-memory --tags large-memory ${{github.event.inputs.test_args}}
1028+
- name: Memory usage summary
1029+
if: always()
1030+
run: |
1031+
kill $(cat /tmp/memmon.pid) 2>/dev/null || true
1032+
echo "=== Memory Summary ==="
1033+
printf "Total RAM: %.1fGB\n" $(awk '/MemTotal/ {print $2/1024/1024}' /proc/meminfo)
1034+
if [ -f /tmp/memfree.log ]; then
1035+
MIN_FREE=$(sort -n /tmp/memfree.log | head -1)
1036+
printf "Minimum free memory: %.1fGB\n" $(echo "$MIN_FREE/1024/1024" | bc -l)
1037+
fi
10141038
test-sanitizer-undefined:
10151039
runs-on: ubuntu-latest
10161040
if: |
@@ -1069,6 +1093,9 @@ jobs:
10691093
- name: unittest
10701094
if: true && !contains(github.event.inputs.skiptests, 'unittest')
10711095
run: ./src/unit/valkey-unit-gtests --accurate
1096+
# Large-memory tests with sanitizers require 10-14GB RAM due to ASAN/UBSAN overhead.
1097+
# GitHub-hosted runners for public repos provide 16GB (ubuntu-latest).
1098+
# These tests are borderline - monitoring memory usage to determine if they can run reliably.
10721099
test-sanitizer-undefined-large-memory:
10731100
runs-on: ubuntu-latest
10741101
if: |
@@ -1094,6 +1121,8 @@ jobs:
10941121
echo "skiptests: ${{github.event.inputs.skiptests}}"
10951122
echo "test_args: ${{github.event.inputs.test_args}}"
10961123
echo "cluster_test_args: ${{github.event.inputs.cluster_test_args}}"
1124+
- name: Log runner memory
1125+
run: free -h
10971126
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
10981127
with:
10991128
repository: ${{ ((github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call') && (inputs.use_repo || github.event.inputs.use_repo)) || github.repository }}
@@ -1106,15 +1135,34 @@ jobs:
11061135
run: |
11071136
sudo apt-get update
11081137
sudo apt-get install tcl8.6 tclx -y
1138+
- name: Start memory monitor
1139+
run: |
1140+
# Track minimum free memory to detect OOM risk
1141+
(while true; do
1142+
FREE=$(awk '/MemAvailable/ {print $2}' /proc/meminfo)
1143+
echo "$FREE" >> /tmp/memfree.log
1144+
sleep 5
1145+
done) &
1146+
echo $! > /tmp/memmon.pid
11091147
- name: unittest
11101148
if: true && !contains(github.event.inputs.skiptests, 'unittest')
11111149
run: ./src/unit/valkey-unit-gtests --accurate --large-memory
11121150
- name: large memory tests
11131151
if: true && !contains(github.event.inputs.skiptests, 'valkey')
1114-
run: ./runtest --accurate --verbose --dump-logs --clients 5 --large-memory --tags large-memory ${{github.event.inputs.test_args}}
1152+
run: ./runtest --accurate --verbose --dump-logs --clients 1 --large-memory --tags large-memory ${{github.event.inputs.test_args}}
11151153
- name: large memory module api tests
11161154
if: true && !contains(github.event.inputs.skiptests, 'modules')
1117-
run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs --clients 5 --large-memory --tags large-memory ${{github.event.inputs.test_args}}
1155+
run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs --clients 1 --large-memory --tags large-memory ${{github.event.inputs.test_args}}
1156+
- name: Memory usage summary
1157+
if: always()
1158+
run: |
1159+
kill $(cat /tmp/memmon.pid) 2>/dev/null || true
1160+
echo "=== Memory Summary ==="
1161+
printf "Total RAM: %.1fGB\n" $(awk '/MemTotal/ {print $2/1024/1024}' /proc/meminfo)
1162+
if [ -f /tmp/memfree.log ]; then
1163+
MIN_FREE=$(sort -n /tmp/memfree.log | head -1)
1164+
printf "Minimum free memory: %.1fGB\n" $(echo "$MIN_FREE/1024/1024" | bc -l)
1165+
fi
11181166
test-sanitizer-force-defrag:
11191167
runs-on: ubuntu-latest
11201168
if: |

src/unit/test_quicklist.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1765,6 +1765,13 @@ TEST_F(QuicklistTest, quicklistCompressAndDecompressQuicklistListpackNode) {
17651765
TEST_F(QuicklistTest, quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX) {
17661766
if (!large_memory) GTEST_SKIP() << "Skipping large memory test";
17671767

1768+
#ifdef VALKEY_ADDRESS_SANITIZER
1769+
/* Skip under ASAN: compression requires both original (4GB) and output
1770+
* buffer (~4GB) simultaneously, totaling ~8GB. With ASAN's 2-3x memory
1771+
* overhead, peak usage reaches ~16-24GB, exceeding GitHub runner limits. */
1772+
GTEST_SKIP() << "Skipping large memory test under address sanitizer";
1773+
#endif
1774+
17681775
#if ULONG_MAX >= 0xffffffffffffffff
17691776

17701777
size_t sz = (1ull << 32);

tests/unit/type/list.tcl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,9 @@ catch {
291291
}
292292
if {[lindex [r config get proto-max-bulk-len] 1] == 10000000000} {
293293

294-
set str_length 5000000000
294+
# Reduced from 5GB to fit in 16GB CI runners with ASAN overhead
295+
# Must exceed 2^32 (4294967296) to test >4GiB (32-bit boundary) behavior
296+
set str_length 4300000000
295297

296298
# repeating all the plain nodes basic checks with 5gb values
297299
test {Test LPUSH and LPOP on plain nodes over 4GB} {

tests/unit/type/set.tcl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1178,7 +1178,9 @@ catch {
11781178
}
11791179
if {[lindex [r config get proto-max-bulk-len] 1] == 10000000000} {
11801180

1181-
set str_length 4400000000 ;#~4.4GB
1181+
# Reduced from 4.4GB to fit in 16GB CI runners with ASAN overhead
1182+
# Must exceed 2^32 (4294967296) to test >4GiB (32-bit boundary) behavior
1183+
set str_length 4300000000 ;#~4GiB, >2^32
11821184

11831185
test {SADD, SCARD, SISMEMBER - large data} {
11841186
r flushdb

tests/unit/violations.tcl

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
# One XADD with one huge 5GB field
1+
# One XADD with one huge >4GiB field (reduced from 5GB for CI memory limits)
2+
# Must exceed 2^32 to require more than 32 bits to address
23
# Expected to fail resulting in an empty stream
34
run_solo {violations} {
45
start_server [list overrides [list save ""] tags {"large-memory"}] {
@@ -8,7 +9,7 @@ start_server [list overrides [list save ""] tags {"large-memory"}] {
89
r write "*5\r\n\$4\r\nXADD\r\n\$2\r\nS1\r\n\$1\r\n*\r\n"
910
r write "\$1\r\nA\r\n"
1011
catch {
11-
write_big_bulk 5000000000 ;#5gb
12+
write_big_bulk 4300000000 ;#~4GiB, >2^32
1213
} err
1314
assert_match {*too large*} $err
1415
r xlen S1
@@ -33,15 +34,16 @@ start_server [list overrides [list save ""] tags {"large-memory"}] {
3334
}
3435

3536
# Gradually add big stream fields using repeated XADD calls
37+
# Reduced from 10 to 3 iterations to fit in 16GB CI runners with ASAN overhead
3638
start_server [list overrides [list save ""] tags {"large-memory"}] {
3739
test {several XADD big fields} {
3840
r config set stream-node-max-bytes 0
39-
for {set j 0} {$j<10} {incr j} {
41+
for {set j 0} {$j<3} {incr j} {
4042
r xadd stream * 1 $::str500 2 $::str500
4143
}
4244
r ping
4345
r xlen stream
44-
} {10}
46+
} {3}
4547
}
4648

4749
# Add over 4GB to a single stream listpack (one XADD command)
@@ -75,14 +77,15 @@ start_server [list overrides [list save ""] tags {"large-memory"}] {
7577

7678
# Add over 4GB to a single hash field (one HSET command)
7779
# Object will be converted to hashtable encoding
80+
# Reduced from 5GB; must exceed 2^32 to test >4GiB (32-bit boundary) behavior
7881
start_server [list overrides [list save ""] tags {"large-memory"}] {
7982
test {hash with one huge field} {
8083
catch {r config set hash-max-ziplist-value 10000000000} ;#10gb
8184
r config set proto-max-bulk-len 10000000000 ;#10gb
8285
r config set client-query-buffer-limit 10000000000 ;#10gb
8386
r write "*4\r\n\$4\r\nHSET\r\n\$2\r\nH1\r\n"
8487
r write "\$1\r\nA\r\n"
85-
write_big_bulk 5000000000 ;#5gb
88+
write_big_bulk 4300000000 ;#~4GiB, >2^32
8689
r object encoding H1
8790
} {hashtable}
8891
}

Comments: 0