fix: prevent deadlock in high thread counts and pwrite livelock (#678)

KimYannn · claude · KimYannn · commit ca0c7b98b3bc · 2026-04-08T15:39:48.000+08:00
Two bugs caused hangs with large PE gz FASTQ files:

1. Lock-free list deadlock: canBeConsumed() required nextItemReady or
   producerFinished, but a single-item list has neither. When
   thread_count > PACK_IN_MEM_LIMIT, each worker gets ≤1 pack before
   reader backpressure — workers cannot consume their only pack,
   processed counter never advances, readers stay blocked.
   Fix: use produced > consumed as the consumability check.

2. Pwrite spin-wait livelock: hardware pause/yield instructions do not
   yield OS timeslice. Under CPU contention (Docker), spinning threads
   starve the predecessor thread that must publish its sequence.
   Fix: replace with std::condition_variable.

Also fix per-thread compress buffer tracking (mCompBufSize was shared
and never updated, causing repeated reallocation per call).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -68,3 +68,9 @@ jobs:
         run: |
           ./fastp --version
           ./fastp -i testdata/R1.fq -o /dev/null
+
+      - name: upload binary
+        uses: actions/upload-artifact@v4
+        with:
+          name: fastp-${{ runner.os }}-${{ runner.arch }}
+          path: fastp
diff --git a/src/singleproducersingleconsumerlist.h b/src/singleproducersingleconsumerlist.h
@@ -92,7 +92,7 @@ class SingleProducerSingleConsumerList {
     inline bool canBeConsumed() {
         if(head == NULL)
             return false;
-        return head->nextItemReady || producerFinished;
+        return produced > consumed;
     }
     inline void produce(T val) {
         LockFreeListItem<T>* item = makeItem(val);
diff --git a/src/writerthread.cpp b/src/writerthread.cpp
@@ -19,7 +19,7 @@ WriterThread::WriterThread(Options* opt, string filename, bool isSTDOUT){
     mNextSeq = NULL;
     mCompressors = NULL;
     mCompBufs = NULL;
-    mCompBufSize = 0;
+    mCompBufSizes = NULL;
     mBufferLists = NULL;
 
     if (mPwriteMode) {
@@ -34,10 +34,13 @@ WriterThread::WriterThread(Options* opt, string filename, bool isSTDOUT){
         for (int t = 0; t < mOptions->thread; t++)
             mCompressors[t] = libdeflate_alloc_compressor(mOptions->compression);
         // Pre-allocate per-worker compress buffers (avoids malloc/free per pack)
-        mCompBufSize = PACK_SIZE * 500;  // ~500 bytes/read worst case
+        size_t initBufSize = PACK_SIZE * 500;  // ~500 bytes/read worst case
         mCompBufs = new char*[mOptions->thread];
-        for (int t = 0; t < mOptions->thread; t++)
-            mCompBufs[t] = new char[mCompBufSize];
+        mCompBufSizes = new size_t[mOptions->thread];
+        for (int t = 0; t < mOptions->thread; t++) {
+            mCompBufs[t] = new char[initBufSize];
+            mCompBufSizes[t] = initBufSize;
+        }
         mWorkingBufferList = 0;
         mBufferLength = 0;
     } else {
@@ -114,11 +117,11 @@ void WriterThread::input(int tid, string* data) {
 
 void WriterThread::inputPwrite(int tid, string* data) {
     size_t bound = libdeflate_gzip_compress_bound(mCompressors[tid], data->size());
-    // Grow pre-allocated buffer if needed
-    if (bound > mCompBufSize) {
+    // Grow per-worker buffer if needed
+    if (bound > mCompBufSizes[tid]) {
         delete[] mCompBufs[tid];
         mCompBufs[tid] = new char[bound];
-        // Note: mCompBufSize is shared but only grows, safe for other threads
+        mCompBufSizes[tid] = bound;
     }
     size_t outsize = libdeflate_gzip_compress(mCompressors[tid], data->data(), data->size(),
                                                mCompBufs[tid], bound);
@@ -130,16 +133,17 @@ void WriterThread::inputPwrite(int tid, string* data) {
 
     size_t seq = mNextSeq[tid];
 
-    // Wait for previous batch's cumulative offset
+    // Wait for previous batch's cumulative offset.
+    // Uses condition variable to avoid priority-inversion livelock when
+    // worker threads outnumber available CPUs (e.g. Docker containers).
     size_t offset = 0;
     if (seq > 0) {
         size_t prevSlot = (seq - 1) & (OFFSET_RING_SIZE - 1);
-        while (mOffsetRing[prevSlot].published_seq.load(std::memory_order_acquire) != seq - 1) {
-#if defined(__aarch64__)
-            __asm__ volatile("yield");
-#elif defined(__x86_64__) || defined(__i386__)
-            __asm__ volatile("pause");
-#endif
+        if (mOffsetRing[prevSlot].published_seq.load(std::memory_order_acquire) != seq - 1) {
+            std::unique_lock<std::mutex> lk(mSeqMtx);
+            mSeqCv.wait(lk, [&]() {
+                return mOffsetRing[prevSlot].published_seq.load(std::memory_order_acquire) == seq - 1;
+            });
         }
         offset = mOffsetRing[prevSlot].cumulative_offset.load(std::memory_order_relaxed);
     }
@@ -148,6 +152,7 @@ void WriterThread::inputPwrite(int tid, string* data) {
     size_t mySlot = seq & (OFFSET_RING_SIZE - 1);
     mOffsetRing[mySlot].cumulative_offset.store(offset + wsize, std::memory_order_relaxed);
     mOffsetRing[mySlot].published_seq.store(seq, std::memory_order_release);
+    mSeqCv.notify_all();
 
     // pwrite (concurrent with other workers on non-overlapping regions)
     if (wsize > 0) {
@@ -182,6 +187,7 @@ void WriterThread::cleanup() {
                 delete[] mCompBufs[t];
             delete[] mCompBufs; mCompBufs = NULL;
         }
+        delete[] mCompBufSizes; mCompBufSizes = NULL;
         return;
     }
     deleteWriter();
diff --git a/src/writerthread.h b/src/writerthread.h
@@ -9,6 +9,7 @@
 #include "options.h"
 #include <atomic>
 #include <mutex>
+#include <condition_variable>
 #include <libdeflate.h>
 #include "singleproducersingleconsumerlist.h"
 
@@ -62,7 +63,9 @@ class WriterThread{
     size_t* mNextSeq;
     libdeflate_compressor** mCompressors;
     char** mCompBufs;       // per-worker pre-allocated compress output buffers
-    size_t mCompBufSize;
+    size_t* mCompBufSizes;  // per-worker buffer sizes
+    std::mutex mSeqMtx;
+    std::condition_variable mSeqCv;
 };
 
 #endif

Original file line number	Diff line number	Diff line change
`@@ -92,7 +92,7 @@ class SingleProducerSingleConsumerList {`
`92`	`92`	`inline bool canBeConsumed() {`
`93`	`93`	`if(head == NULL)`
`94`	`94`	`return false;`
`95`		`- return head->nextItemReady || producerFinished;`
	`95`	`+ return produced > consumed;`
`96`	`96`	`}`
`97`	`97`	`inline void produce(T val) {`
`98`	`98`	`LockFreeListItem<T>* item = makeItem(val);`