@@ -227,9 +227,6 @@ void ParquetReader::read_column_chunk_int(ColumnChunk &cc) {
227227 int64_t chunk_start =
228228 cc.has_dictionary ? dictionary_page_offset : data_page_offset;
229229
230- // Give a chance to R to allocate memory for the column chunk
231- alloc_column_chunk (cc);
232-
233230 // read in the whole chunk
234231 BufferGuard tmp_buf_g = bufman_cc->claim ();
235232 ByteBuffer &tmp_buf = tmp_buf_g.buf ;
@@ -239,6 +236,25 @@ void ParquetReader::read_column_chunk_int(ColumnChunk &cc) {
239236 uint8_t *ptr = (uint8_t *) tmp_buf.ptr ;
240237 uint8_t *end = ptr + cmd.total_compressed_size ;
241238
239+ // Polars does not set dictionary_page_offset :(
240+ // https://github.com/r-lib/nanoparquet/issues/132
241+ // We need to do this fix before calling alloc_column_chunk(), so the
242+ // callback correctly knows if this chunk has a dictionary page.
243+ // Sadly, this means that we are parsing the header of the first data
244+ // page twice, for files that adhere to the spec and don't have dict
245+ // pages. :((
246+ if (!cc.has_dictionary ) {
247+ PageHeader dph;
248+ uint32_t ph_size = cmd.total_compressed_size ;
249+ thrift_unpack (ptr, &ph_size, &dph, filename_);
250+ if (dph.type == parquet::PageType::DICTIONARY_PAGE) {
251+ cc.has_dictionary = true ;
252+ }
253+ }
254+
255+ // Give R a chance to allocate memory for the column chunk
256+ alloc_column_chunk (cc);
257+
242258 // dictionary page, if any
243259 if (cc.has_dictionary ) {
244260 PageHeader dph;
0 commit comments