Merge pull request #134 from r-lib/fix/dictionary-offset-na

gaborcsardi · web-flow · commit d3e6d738ff86 · 2025-02-22T11:04:14.000+01:00
diff --git a/src/lib/ParquetReader.cpp b/src/lib/ParquetReader.cpp
@@ -227,9 +227,6 @@ void ParquetReader::read_column_chunk_int(ColumnChunk &cc) {
   int64_t chunk_start =
     cc.has_dictionary ? dictionary_page_offset : data_page_offset;
 
-  // Give a chance to R to allocate memory for the column chunk
-  alloc_column_chunk(cc);
-
   // read in the whole chunk
   BufferGuard tmp_buf_g = bufman_cc->claim();
   ByteBuffer &tmp_buf = tmp_buf_g.buf;
@@ -239,6 +236,25 @@ void ParquetReader::read_column_chunk_int(ColumnChunk &cc) {
   uint8_t *ptr = (uint8_t*) tmp_buf.ptr;
   uint8_t *end = ptr + cmd.total_compressed_size;
 
+  // Polars writes a dictionary page without setting dictionary_page_offset.
+  // https://github.com/r-lib/nanoparquet/issues/132
+  // Detect this by peeking at the first page header. This must happen
+  // before calling alloc_column_chunk(), so the callback correctly knows
+  // if this chunk has a dictionary page. The cost is that for files that
+  // adhere to the spec and have no dict page, the header of the first
+  // data page is parsed twice.
+  if (!cc.has_dictionary) {
+    PageHeader dph;
+    uint32_t ph_size = cmd.total_compressed_size;
+    thrift_unpack(ptr, &ph_size, &dph, filename_);
+    if (dph.type == parquet::PageType::DICTIONARY_PAGE) {
+      cc.has_dictionary = true;
+    }
+  }
+
+  // Give a chance to R to allocate memory for the column chunk
+  alloc_column_chunk(cc);
+
   // dictionary page, if any
   if (cc.has_dictionary) {
     PageHeader dph;
diff --git a/tests/testthat/data/broken/polars-no-dict-offset.parquet b/tests/testthat/data/broken/polars-no-dict-offset.parquet
diff --git a/tests/testthat/test-read-parquet-5.R b/tests/testthat/test-read-parquet-5.R
@@ -226,3 +226,12 @@ test_that("mixing RLE_DICTIONARY and PLAIN, FLOAT16", {
   bs2[is.na(t1[,2])] <- NA
   expect_equal(t1[,2], bs2)
 })
+
+# https://github.com/r-lib/nanoparquet/issues/132
+test_that("dict page w/o dict offset set", {
+  pf <- test_path("data/broken/polars-no-dict-offset.parquet")
+  expect_equal(
+    as.data.frame(read_parquet(pf)),
+    data.frame(a = c(1,2,3), b = c(4,5,6))
+  )
+})