Skip to content

Commit cdc3970

Browse files
JanuszLstiepan
authored and committed
Fix race condition in the CPU numpy reader (#4848)
- when the last sample in the numpy reader is duplicated due to padding, the reader tries to read the same sample into the same memory, but from different clones of it concurrently. What is more, the memory that is read can be allocated and deallocated concurrently. This PR removes this redundant and dangerous work, making sure that each sample is read only once. Signed-off-by: Janusz Lisiecki <[email protected]>
1 parent 17609b8 commit cdc3970

File tree

1 file changed

+18
-19
lines changed

1 file changed

+18
-19
lines changed

dali/operators/reader/numpy_reader_op.cc

+18-19
Original file line numberDiff line numberDiff line change
@@ -252,16 +252,22 @@ void NumpyReaderCPU::Prefetch() {
252252

253253
string previous_path;
254254
for (unsigned idx = 0; idx < curr_batch.size(); ++idx) {
255-
auto &target = curr_batch[idx];
256-
if (target->data.shares_data()) {
257-
target->data.Reset();
255+
// in case of pad_last_batch the curr_batch elements are pointing to the same object
256+
// including the data, so there is no need to read it again or it can even lead to a race
257+
// with allocation/deallocation of memory and concurrent read
258+
if (idx > 0 && curr_batch[idx - 1] == curr_batch[idx]) {
259+
break;
258260
}
261+
auto &target = curr_batch[idx];
262+
259263
// if we pad last batch but we duplicate the samples from the previous one - a case
260264
// with multiple unequal shards where we need to create a full duplicated batch
261-
// so we need to reopen the file and seek
265+
// so there is no need to read this data again
262266
if (!target->current_file) {
263-
target->current_file = FileStream::Open(target->filename, false, false, use_o_direct_);
264-
target->current_file->SeekRead(target->data_offset);
267+
break;
268+
}
269+
if (target->data.shares_data()) {
270+
target->data.Reset();
265271
}
266272
if (use_o_direct_) {
267273
/*
@@ -308,19 +314,12 @@ void NumpyReaderCPU::Prefetch() {
308314
}
309315
target->data.ShareData(tmp_mem, target->nbytes, false, target->shape, target->type, -1);
310316
} else {
311-
if (idx > 0 && curr_batch[idx - 1]->filename == target->filename && target->current_file) {
312-
// in case of pad_last_batch we can/should copy previous sample as target->current_file
313-
// has already been used to read the data and it moved the offset. If we read it will
314-
// return 0 - no more data to read from this file.
315-
target->data.Copy(curr_batch[idx - 1]->data);
316-
} else {
317-
target->data.Resize(target->shape, target->type);
318-
auto data_ptr = static_cast<uint8_t*>(target->data.raw_mutable_data());
319-
Index ret = target->current_file->Read(data_ptr, target->nbytes);
320-
DALI_ENFORCE(ret == static_cast<Index>(target->nbytes),
321-
make_string("Failed to read file: ", target->filename,
322-
", read: ", ret, " while it should be ", target->nbytes));
323-
}
317+
target->data.Resize(target->shape, target->type);
318+
auto data_ptr = static_cast<uint8_t*>(target->data.raw_mutable_data());
319+
Index ret = target->current_file->Read(data_ptr, target->nbytes);
320+
DALI_ENFORCE(ret == static_cast<Index>(target->nbytes),
321+
make_string("Failed to read file: ", target->filename,
322+
", read: ", ret, " while it should be ", target->nbytes));
324323
}
325324
}
326325
thread_pool_.RunAll();

0 commit comments

Comments
 (0)