Skip to content

Commit e23b8e8

Browse files
KimYannnclaude
andcommitted
fix: prevent deadlock in high thread counts and pwrite livelock (#678)
Two bugs caused hangs with large PE gz FASTQ files: 1. Lock-free list deadlock: canBeConsumed() required nextItemReady or producerFinished, but a single-item list has neither. When thread_count > PACK_IN_MEM_LIMIT, each worker gets ≤1 pack before reader backpressure — workers cannot consume their only pack, processed counter never advances, readers stay blocked. Fix: use produced > consumed as the consumability check. 2. Pwrite spin-wait livelock: hardware pause/yield instructions do not yield OS timeslice. Under CPU contention (Docker), spinning threads starve the predecessor thread that must publish its sequence. Fix: replace with std::condition_variable. Also fix per-thread compress buffer tracking (mCompBufSize was shared and never updated, causing repeated reallocation per call). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent b138112 commit e23b8e8

4 files changed

Lines changed: 87 additions & 57 deletions

File tree

.github/workflows/ci.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,3 +68,9 @@ jobs:
6868
run: |
6969
./fastp --version
7070
./fastp -i testdata/R1.fq -o /dev/null
71+
72+
- name: upload binary
73+
uses: actions/upload-artifact@v4
74+
with:
75+
name: fastp-${{ runner.os }}-${{ runner.arch }}
76+
path: fastp

src/singleproducersingleconsumerlist.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,8 @@ class SingleProducerSingleConsumerList {
9999
if(head==NULL) {
100100
head = item;
101101
tail = item;
102+
// Signal the first item is consumable (no predecessor to set this)
103+
head->nextItemReady.store(true, std::memory_order_release);
102104
} else {
103105
tail->nextItem = item;
104106
tail->nextItemReady = true;

src/writerthread.cpp

Lines changed: 66 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#include <fcntl.h>
66
#include <cerrno>
77
#include <cstring>
8-
#include <thread>
98

109
WriterThread::WriterThread(Options* opt, string filename, bool isSTDOUT){
1110
mOptions = opt;
@@ -15,29 +14,32 @@ WriterThread::WriterThread(Options* opt, string filename, bool isSTDOUT){
1514

1615
mPwriteMode = !isSTDOUT && ends_with(filename, ".gz") && mOptions->thread > 1;
1716
mFd = -1;
18-
mOffsetRing = NULL;
17+
mRing = NULL;
1918
mNextSeq = NULL;
19+
mCumulativeOffset = 0;
2020
mCompressors = NULL;
2121
mCompBufs = NULL;
22-
mCompBufSize = 0;
22+
mCompBufSizes = NULL;
2323
mBufferLists = NULL;
2424

2525
if (mPwriteMode) {
2626
mFd = open(mFilename.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644);
2727
if (mFd < 0)
2828
error_exit("Failed to open for pwrite: " + mFilename);
29-
mOffsetRing = new OffsetSlot[OFFSET_RING_SIZE];
29+
mRing = new PwriteSlot[PWRITE_RING_SIZE];
3030
mNextSeq = new size_t[mOptions->thread];
3131
for (int t = 0; t < mOptions->thread; t++)
3232
mNextSeq[t] = t;
3333
mCompressors = new libdeflate_compressor*[mOptions->thread];
3434
for (int t = 0; t < mOptions->thread; t++)
3535
mCompressors[t] = libdeflate_alloc_compressor(mOptions->compression);
36-
// Pre-allocate per-worker compress buffers (avoids malloc/free per pack)
37-
mCompBufSize = PACK_SIZE * 500; // ~500 bytes/read worst case
36+
size_t initBufSize = PACK_SIZE * 500;
3837
mCompBufs = new char*[mOptions->thread];
39-
for (int t = 0; t < mOptions->thread; t++)
40-
mCompBufs[t] = new char[mCompBufSize];
38+
mCompBufSizes = new size_t[mOptions->thread];
39+
for (int t = 0; t < mOptions->thread; t++) {
40+
mCompBufs[t] = new char[initBufSize];
41+
mCompBufSizes[t] = initBufSize;
42+
}
4143
mWorkingBufferList = 0;
4244
mBufferLength = 0;
4345
} else {
@@ -54,7 +56,7 @@ WriterThread::~WriterThread() {
5456

5557
bool WriterThread::isCompleted()
5658
{
57-
if (mPwriteMode) return true; // no writer thread needed
59+
if (mPwriteMode) return true;
5860
return mInputCompleted && (mBufferLength==0);
5961
}
6062

@@ -72,25 +74,13 @@ bool WriterThread::setInputCompleted() {
7274
}
7375

7476
void WriterThread::setInputCompletedPwrite() {
75-
int W = mOptions->thread;
76-
size_t lastSeq = 0;
77-
bool anyProcessed = false;
78-
for (int t = 0; t < W; t++) {
79-
if (mNextSeq[t] != (size_t)t) {
80-
size_t workerLastSeq = mNextSeq[t] - W;
81-
if (!anyProcessed || workerLastSeq > lastSeq) {
82-
lastSeq = workerLastSeq;
83-
anyProcessed = true;
84-
}
85-
}
86-
}
87-
size_t offset = anyProcessed ?
88-
mOffsetRing[lastSeq & (OFFSET_RING_SIZE - 1)].cumulative_offset.load(std::memory_order_relaxed) : 0;
89-
ftruncate(mFd, offset);
77+
// Flush all remaining slots
78+
flushReady();
79+
ftruncate(mFd, mCumulativeOffset);
9080
}
9181

9282
void WriterThread::output(){
93-
if (mPwriteMode) return; // no-op
83+
if (mPwriteMode) return;
9484
SingleProducerSingleConsumerList<string*>* list = mBufferLists[mWorkingBufferList];
9585
if(!list->canBeConsumed()) {
9686
usleep(100);
@@ -114,46 +104,48 @@ void WriterThread::input(int tid, string* data) {
114104

115105
void WriterThread::inputPwrite(int tid, string* data) {
116106
size_t bound = libdeflate_gzip_compress_bound(mCompressors[tid], data->size());
117-
// Grow pre-allocated buffer if needed
118-
if (bound > mCompBufSize) {
107+
if (bound > mCompBufSizes[tid]) {
119108
delete[] mCompBufs[tid];
120109
mCompBufs[tid] = new char[bound];
121-
// Note: mCompBufSize is shared but only grows, safe for other threads
110+
mCompBufSizes[tid] = bound;
122111
}
123112
size_t outsize = libdeflate_gzip_compress(mCompressors[tid], data->data(), data->size(),
124113
mCompBufs[tid], bound);
125114
if (outsize == 0)
126115
error_exit("libdeflate gzip compression failed");
127116
delete data;
128-
const char* writeData = mCompBufs[tid];
129-
size_t wsize = outsize;
130117

131118
size_t seq = mNextSeq[tid];
119+
size_t slot = seq & (PWRITE_RING_SIZE - 1);
132120

133-
// Wait for previous batch's cumulative offset
134-
size_t offset = 0;
135-
if (seq > 0) {
136-
size_t prevSlot = (seq - 1) & (OFFSET_RING_SIZE - 1);
137-
while (mOffsetRing[prevSlot].published_seq.load(std::memory_order_acquire) != seq - 1) {
138-
#if defined(__aarch64__)
139-
__asm__ volatile("yield");
140-
#elif defined(__x86_64__) || defined(__i386__)
141-
__asm__ volatile("pause");
142-
#endif
143-
}
144-
offset = mOffsetRing[prevSlot].cumulative_offset.load(std::memory_order_relaxed);
121+
// Wait if slot not yet free (ring backpressure from previous round)
122+
while (mRing[slot].state.load(std::memory_order_acquire) != 0) {
123+
usleep(1);
145124
}
146125

147-
// Publish offset BEFORE pwrite — next worker starts immediately
148-
size_t mySlot = seq & (OFFSET_RING_SIZE - 1);
149-
mOffsetRing[mySlot].cumulative_offset.store(offset + wsize, std::memory_order_relaxed);
150-
mOffsetRing[mySlot].published_seq.store(seq, std::memory_order_release);
126+
// Deposit compressed data (FREE → COMPRESSED)
127+
mRing[slot].data = mCompBufs[tid];
128+
mRing[slot].size = outsize;
129+
mRing[slot].state.store(1, std::memory_order_release);
130+
131+
// Try to assign offsets for consecutive ready slots
132+
flushReady();
151133

152-
// pwrite (concurrent with other workers on non-overlapping regions)
153-
if (wsize > 0) {
134+
// Wait for MY offset to be assigned (COMPRESSED → OFFSET_READY)
135+
while (mRing[slot].state.load(std::memory_order_acquire) != 2) {
136+
flushReady(); // help flush if possible
137+
// Another worker may be flushing; brief yield
138+
if (mRing[slot].state.load(std::memory_order_acquire) != 2)
139+
usleep(1);
140+
}
141+
142+
// Concurrent pwrite — offset already computed, no ordering wait
143+
if (outsize > 0) {
154144
size_t written = 0;
155-
while (written < wsize) {
156-
ssize_t ret = pwrite(mFd, writeData + written, wsize - written, offset + written);
145+
size_t offset = mRing[slot].offset;
146+
while (written < outsize) {
147+
ssize_t ret = pwrite(mFd, mRing[slot].data + written,
148+
outsize - written, offset + written);
157149
if (ret < 0) {
158150
if (errno == EINTR) continue;
159151
error_exit("pwrite failed: " + string(strerror(errno)));
@@ -164,13 +156,35 @@ void WriterThread::inputPwrite(int tid, string* data) {
164156
}
165157
}
166158

159+
// Mark slot free for reuse
160+
mRing[slot].state.store(0, std::memory_order_release);
161+
167162
mNextSeq[tid] += mOptions->thread;
168163
}
169164

165+
void WriterThread::flushReady() {
166+
if (!mFlushMtx.try_lock())
167+
return;
168+
size_t seq = mFlushSeq.load(std::memory_order_relaxed);
169+
while (true) {
170+
size_t slot = seq & (PWRITE_RING_SIZE - 1);
171+
if (mRing[slot].state.load(std::memory_order_acquire) != 1)
172+
break;
173+
// Assign offset (fast — just an addition)
174+
mRing[slot].offset = mCumulativeOffset;
175+
mCumulativeOffset += mRing[slot].size;
176+
// COMPRESSED → OFFSET_READY
177+
mRing[slot].state.store(2, std::memory_order_release);
178+
seq++;
179+
}
180+
mFlushSeq.store(seq, std::memory_order_release);
181+
mFlushMtx.unlock();
182+
}
183+
170184
void WriterThread::cleanup() {
171185
if (mPwriteMode) {
172186
if (mFd >= 0) { close(mFd); mFd = -1; }
173-
delete[] mOffsetRing; mOffsetRing = NULL;
187+
delete[] mRing; mRing = NULL;
174188
delete[] mNextSeq; mNextSeq = NULL;
175189
if (mCompressors) {
176190
for (int t = 0; t < mOptions->thread; t++)
@@ -182,6 +196,7 @@ void WriterThread::cleanup() {
182196
delete[] mCompBufs[t];
183197
delete[] mCompBufs; mCompBufs = NULL;
184198
}
199+
delete[] mCompBufSizes; mCompBufSizes = NULL;
185200
return;
186201
}
187202
deleteWriter();

src/writerthread.h

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,14 @@
1414

1515
using namespace std;
1616

17-
static constexpr int OFFSET_RING_SIZE = 512;
17+
static constexpr int PWRITE_RING_SIZE = 512;
1818

19-
struct alignas(64) OffsetSlot {
20-
std::atomic<size_t> cumulative_offset{0};
21-
std::atomic<size_t> published_seq{SIZE_MAX};
19+
// States: FREE(0) → COMPRESSED(1) → OFFSET_READY(2) → FREE(0)
20+
struct alignas(64) PwriteSlot {
21+
const char* data;
22+
size_t size;
23+
size_t offset;
24+
std::atomic<int> state{0};
2225
};
2326

2427
class WriterThread{
@@ -43,6 +46,7 @@ class WriterThread{
4346
private:
4447
void deleteWriter();
4548
void inputPwrite(int tid, string* data);
49+
void flushReady();
4650
void setInputCompletedPwrite();
4751

4852
private:
@@ -58,11 +62,14 @@ class WriterThread{
5862
// pwrite mode: parallel libdeflate gz compression + direct file write
5963
bool mPwriteMode;
6064
int mFd;
61-
OffsetSlot* mOffsetRing;
65+
PwriteSlot* mRing;
6266
size_t* mNextSeq;
67+
std::atomic<size_t> mFlushSeq{0}; // next seq to flush
68+
size_t mCumulativeOffset;
69+
std::mutex mFlushMtx;
6370
libdeflate_compressor** mCompressors;
6471
char** mCompBufs; // per-worker pre-allocated compress output buffers
65-
size_t mCompBufSize;
72+
size_t* mCompBufSizes; // per-worker buffer sizes
6673
};
6774

6875
#endif

0 commit comments

Comments
 (0)