Skip to content

Commit d3e6d73

Browse files
authored
Merge pull request #134 from r-lib/fix/dictionary-offset-na
2 parents de7c6d8 + 2e3ee31 commit d3e6d73

File tree

3 files changed

+28
-3
lines changed

3 files changed

+28
-3
lines changed

src/lib/ParquetReader.cpp

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -227,9 +227,6 @@ void ParquetReader::read_column_chunk_int(ColumnChunk &cc) {
227227
int64_t chunk_start =
228228
cc.has_dictionary ? dictionary_page_offset : data_page_offset;
229229

230-
// Give R a chance to allocate memory for the column chunk
231-
alloc_column_chunk(cc);
232-
233230
// read in the whole chunk
234231
BufferGuard tmp_buf_g = bufman_cc->claim();
235232
ByteBuffer &tmp_buf = tmp_buf_g.buf;
@@ -239,6 +236,25 @@ void ParquetReader::read_column_chunk_int(ColumnChunk &cc) {
239236
uint8_t *ptr = (uint8_t*) tmp_buf.ptr;
240237
uint8_t *end = ptr + cmd.total_compressed_size;
241238

239+
// Polars does not set dictionary_page_offset :(
240+
// https://github.com/r-lib/nanoparquet/issues/132
241+
// We need to do this fix before calling alloc_column_chunk(), so the
242+
// callback correctly knows if this chunk has a dictionary page.
243+
// Sadly, this means that we are parsing the header of the first data
244+
// page twice, for files that adhere to the spec and don't have dict
245+
// pages. :((
246+
if (!cc.has_dictionary) {
247+
PageHeader dph;
248+
uint32_t ph_size = cmd.total_compressed_size;
249+
thrift_unpack(ptr, &ph_size, &dph, filename_);
250+
if (dph.type == parquet::PageType::DICTIONARY_PAGE) {
251+
cc.has_dictionary = true;
252+
}
253+
}
254+
255+
// Give R a chance to allocate memory for the column chunk
256+
alloc_column_chunk(cc);
257+
242258
// dictionary page, if any
243259
if (cc.has_dictionary) {
244260
PageHeader dph;
872 Bytes
Binary file not shown.

tests/testthat/test-read-parquet-5.R

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,3 +226,12 @@ test_that("mixing RLE_DICTIONARY and PLAIN, FLOAT16", {
226226
bs2[is.na(t1[,2])] <- NA
227227
expect_equal(t1[,2], bs2)
228228
})
229+
230+
# https://github.com/r-lib/nanoparquet/issues/132
231+
test_that("dict page w/o dict offset set", {
232+
pf <- test_path("data/broken/polars-no-dict-offset.parquet")
233+
expect_equal(
234+
as.data.frame(read_parquet(pf)),
235+
data.frame(a = c(1,2,3), b = c(4,5,6))
236+
)
237+
})

0 commit comments

Comments
 (0)