Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cpp/src/parquet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ set(PARQUET_SRCS
encryption/internal_file_encryptor.cc
exception.cc
file_reader.cc
file_rewriter.cc
file_writer.cc
geospatial/statistics.cc
geospatial/util_internal.cc
Expand Down Expand Up @@ -408,6 +409,8 @@ add_parquet_test(arrow-reader-writer-test
arrow/arrow_statistics_test.cc
arrow/variant_test.cc)

add_parquet_test(arrow-rewriter-test SOURCES arrow/arrow_rewriter_test.cc)

add_parquet_test(arrow-internals-test SOURCES arrow/path_internal_test.cc
arrow/reconstruct_internal_test.cc)

Expand Down
48 changes: 16 additions & 32 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -375,19 +375,19 @@ const double test_traits<::arrow::DoubleType>::value(4.2);
template <>
struct test_traits<::arrow::StringType> {
static constexpr ParquetType::type parquet_enum = ParquetType::BYTE_ARRAY;
static std::string const value;
static const std::string value;
};

template <>
struct test_traits<::arrow::BinaryType> {
static constexpr ParquetType::type parquet_enum = ParquetType::BYTE_ARRAY;
static std::string const value;
static const std::string value;
};

template <>
struct test_traits<::arrow::FixedSizeBinaryType> {
static constexpr ParquetType::type parquet_enum = ParquetType::FIXED_LEN_BYTE_ARRAY;
static std::string const value;
static const std::string value;
};

const std::string test_traits<::arrow::StringType>::value("Test"); // NOLINT
Expand Down Expand Up @@ -5906,28 +5906,6 @@ auto encode_double = [](double value) {

class ParquetPageIndexRoundTripTest : public ::testing::Test {
public:
void WriteFile(const std::shared_ptr<WriterProperties>& writer_properties,
const std::shared_ptr<::arrow::Table>& table) {
// Get schema from table.
auto schema = table->schema();
std::shared_ptr<SchemaDescriptor> parquet_schema;
auto arrow_writer_properties = default_arrow_writer_properties();
ASSERT_OK_NO_THROW(ToParquetSchema(schema.get(), *writer_properties,
*arrow_writer_properties, &parquet_schema));
auto schema_node = std::static_pointer_cast<GroupNode>(parquet_schema->schema_root());

// Write table to buffer.
auto sink = CreateOutputStream();
auto pool = ::arrow::default_memory_pool();
auto writer = ParquetFileWriter::Open(sink, schema_node, writer_properties);
std::unique_ptr<FileWriter> arrow_writer;
ASSERT_OK(FileWriter::Make(pool, std::move(writer), schema, arrow_writer_properties,
&arrow_writer));
ASSERT_OK_NO_THROW(arrow_writer->WriteTable(*table));
ASSERT_OK_NO_THROW(arrow_writer->Close());
ASSERT_OK_AND_ASSIGN(buffer_, sink->Finish());
}

void ReadPageIndexes(int expect_num_row_groups, int expect_num_pages,
const std::set<int>& expect_columns_without_index = {}) {
auto read_properties = default_arrow_reader_properties();
Expand Down Expand Up @@ -6015,7 +5993,8 @@ TEST_F(ParquetPageIndexRoundTripTest, SimpleRoundTrip) {
[null, "d", [] ],
[5, null, [3, 3, 3]],
[6, "f", null ]
])"}));
])"}),
buffer_);

ReadPageIndexes(/*expect_num_row_groups=*/2, /*expect_num_pages=*/1);

Expand Down Expand Up @@ -6057,7 +6036,8 @@ TEST_F(ParquetPageIndexRoundTripTest, SimpleRoundTripWithStatsDisabled) {
[null, "d", [] ],
[5, null, [3, 3, 3]],
[6, "f", null ]
])"}));
])"}),
buffer_);

ReadPageIndexes(/*expect_num_row_groups=*/1, /*expect_num_pages=*/1);
for (auto& column_index : column_indexes_) {
Expand All @@ -6082,7 +6062,8 @@ TEST_F(ParquetPageIndexRoundTripTest, SimpleRoundTripWithColumnStatsDisabled) {
[null, "d", [] ],
[5, null, [3, 3, 3]],
[6, "f", null ]
])"}));
])"}),
buffer_);

ReadPageIndexes(/*expect_num_row_groups=*/2, /*expect_num_pages=*/1);

Expand Down Expand Up @@ -6116,7 +6097,8 @@ TEST_F(ParquetPageIndexRoundTripTest, DropLargeStats) {
WriteFile(writer_properties, ::arrow::TableFromJSON(schema, {R"([
["short_string"],
["very_large_string_to_drop_stats"]
])"}));
])"}),
buffer_);

ReadPageIndexes(/*expect_num_row_groups=*/2, /*expect_num_pages=*/1);

Expand All @@ -6140,7 +6122,8 @@ TEST_F(ParquetPageIndexRoundTripTest, MultiplePages) {
writer_properties,
::arrow::TableFromJSON(
schema, {R"([[1, "a"], [2, "b"]])", R"([[3, "c"], [4, "d"]])",
R"([[null, null], [6, "f"]])", R"([[null, null], [null, null]])"}));
R"([[null, null], [6, "f"]])", R"([[null, null], [null, null]])"}),
buffer_);

ReadPageIndexes(/*expect_num_row_groups=*/1, /*expect_num_pages=*/4);

Expand Down Expand Up @@ -6180,7 +6163,7 @@ TEST_F(ParquetPageIndexRoundTripTest, DoubleWithNaNs) {

auto schema = ::arrow::schema({::arrow::field("c0", ::arrow::float64())});
auto table = Table::Make(schema, {chunked_array});
WriteFile(writer_properties, table);
WriteFile(writer_properties, table, buffer_);

ReadPageIndexes(/*expect_num_row_groups=*/4, /*expect_num_pages=*/1);

Expand Down Expand Up @@ -6215,7 +6198,8 @@ TEST_F(ParquetPageIndexRoundTripTest, EnablePerColumn) {
->enable_write_page_index("c0") /* enable c0 explicitly */
->disable_write_page_index("c1") /* disable c1 explicitly */
->build();
WriteFile(writer_properties, ::arrow::TableFromJSON(schema, {R"([[0, 1, 2]])"}));
WriteFile(writer_properties, ::arrow::TableFromJSON(schema, {R"([[0, 1, 2]])"}),
buffer_);

ReadPageIndexes(/*expect_num_row_groups=*/1, /*expect_num_pages=*/1,
/*expect_columns_without_index=*/{1});
Expand Down
185 changes: 185 additions & 0 deletions cpp/src/parquet/arrow/arrow_rewriter_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/io/memory.h"
#include "arrow/testing/gtest_util.h"
#include "parquet/arrow/reader.h"
#include "parquet/file_reader.h"
#include "parquet/file_rewriter.h"
#ifdef _MSC_VER
# pragma warning(push)
// Disable forcing value to bool warnings
# pragma warning(disable : 4800)
#endif

#include <memory>

#include "gtest/gtest.h"

#include "parquet/arrow/test_util.h"
#include "parquet/platform.h"
#include "parquet/properties.h"

using arrow::Table;
using arrow::io::BufferReader;

namespace parquet::arrow {

TEST(ParquetRewriterTest, ConcatRoundTrip) {
  // Vertically concatenate two files sharing one schema into a single output
  // file, then read the result back and compare with the stacked rows.
  auto props = RewriterProperties::Builder()
                   .writer_properties(
                       WriterProperties::Builder().enable_write_page_index()->build())
                   ->build();

  auto schema = ::arrow::schema(
      {::arrow::field("a", ::arrow::int32()), ::arrow::field("b", ::arrow::utf8())});

  // Serialize the two input files into in-memory buffers.
  std::shared_ptr<Buffer> first_buffer;
  std::shared_ptr<Buffer> second_buffer;
  WriteFile(props->writer_properties(),
            ::arrow::TableFromJSON(schema, {R"([[1, "a"], [2, "b"]])"}), first_buffer);
  WriteFile(props->writer_properties(),
            ::arrow::TableFromJSON(schema, {R"([[3, "c"]])"}), second_buffer);

  // Rewrite: one column group containing both inputs, stacked in order.
  auto sink = CreateOutputStream();
  auto rewriter = ParquetFileRewriter::Open(
      {{std::make_shared<BufferReader>(first_buffer),
        std::make_shared<BufferReader>(second_buffer)}},
      sink, {{NULLPTR, NULLPTR}}, NULLPTR, props);
  rewriter->Rewrite();
  rewriter->Close();

  // Reopen the rewritten bytes through the Arrow reader.
  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Buffer> result_buffer, sink->Finish());
  std::unique_ptr<FileReader> arrow_reader;
  ASSERT_OK_NO_THROW(FileReader::Make(
      ::arrow::default_memory_pool(),
      ParquetFileReader::Open(std::make_shared<BufferReader>(result_buffer)),
      &arrow_reader));

  std::shared_ptr<Table> actual;
  ASSERT_OK(arrow_reader->ReadTable(&actual));
  ASSERT_OK(actual->ValidateFull());

  // The output must hold the rows of both inputs, first file first.
  auto expected = ::arrow::TableFromJSON(schema, {R"([[1, "a"], [2, "b"], [3, "c"]])"});
  AssertTablesEqual(*expected, *actual);
}

TEST(ParquetRewriterTest, DISABLED_ExtendRoundTrip) {
  // Join two files with disjoint schemas side by side (horizontal extend),
  // then verify the merged table carries all columns row-aligned.
  auto props = RewriterProperties::Builder()
                   .writer_properties(
                       WriterProperties::Builder().enable_write_page_index()->build())
                   ->build();

  auto left_schema = ::arrow::schema(
      {::arrow::field("a", ::arrow::int32()), ::arrow::field("b", ::arrow::utf8())});
  auto right_schema = ::arrow::schema({::arrow::field("c", ::arrow::int64())});

  // One buffer per column group; both inputs hold the same number of rows.
  std::shared_ptr<Buffer> left_buffer;
  std::shared_ptr<Buffer> right_buffer;
  WriteFile(props->writer_properties(),
            ::arrow::TableFromJSON(left_schema, {R"([[1, "a"], [2, "b"], [3, "c"]])"}),
            left_buffer);
  WriteFile(props->writer_properties(),
            ::arrow::TableFromJSON(right_schema, {R"([[10], [20], [30]])"}),
            right_buffer);

  // Rewrite: two column groups with a single file each.
  auto sink = CreateOutputStream();
  auto rewriter = ParquetFileRewriter::Open(
      {{std::make_shared<BufferReader>(left_buffer)},
       {std::make_shared<BufferReader>(right_buffer)}},
      sink, {{NULLPTR}, {NULLPTR}}, NULLPTR, props);
  rewriter->Rewrite();
  rewriter->Close();

  // Read the rewritten file back into an Arrow table.
  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Buffer> result_buffer, sink->Finish());
  std::unique_ptr<FileReader> arrow_reader;
  ASSERT_OK_NO_THROW(FileReader::Make(
      ::arrow::default_memory_pool(),
      ParquetFileReader::Open(std::make_shared<BufferReader>(result_buffer)),
      &arrow_reader));

  std::shared_ptr<Table> actual;
  ASSERT_OK(arrow_reader->ReadTable(&actual));
  ASSERT_OK(actual->ValidateFull());

  // Expect the union of both schemas, left columns first.
  auto merged_schema = ::arrow::schema({::arrow::field("a", ::arrow::int32()),
                                        ::arrow::field("b", ::arrow::utf8()),
                                        ::arrow::field("c", ::arrow::int64())});
  auto expected = ::arrow::TableFromJSON(
      merged_schema, {R"([[1, "a", 10], [2, "b", 20], [3, "c", 30]])"});
  AssertTablesEqual(*expected, *actual);
}

TEST(ParquetRewriterTest, DISABLED_SimpleRoundTrip) {
  // Combine both rewrite dimensions at once: two column groups (horizontal),
  // each assembled from two stacked input files (vertical). The two sides
  // split their rows at different points (2+1 vs. 1+2), and the writer caps
  // row groups at one row each.
  auto props = RewriterProperties::Builder()
                   .writer_properties(WriterProperties::Builder()
                                          .enable_write_page_index()
                                          ->max_row_group_length(1)
                                          ->build())
                   ->build();

  auto left_schema = ::arrow::schema(
      {::arrow::field("a", ::arrow::int32()), ::arrow::field("b", ::arrow::utf8())});
  auto right_schema = ::arrow::schema({::arrow::field("c", ::arrow::int64())});

  // Four input buffers: {left, right} x {upper, lower}.
  std::shared_ptr<Buffer> left_upper;
  std::shared_ptr<Buffer> left_lower;
  std::shared_ptr<Buffer> right_upper;
  std::shared_ptr<Buffer> right_lower;
  WriteFile(props->writer_properties(),
            ::arrow::TableFromJSON(left_schema, {R"([[1, "a"], [2, "b"]])"}),
            left_upper);
  WriteFile(props->writer_properties(),
            ::arrow::TableFromJSON(left_schema, {R"([[3, "c"]])"}), left_lower);
  WriteFile(props->writer_properties(),
            ::arrow::TableFromJSON(right_schema, {R"([[10]])"}), right_upper);
  WriteFile(props->writer_properties(),
            ::arrow::TableFromJSON(right_schema, {R"([[20], [30]])"}), right_lower);

  // Rewrite: each column group lists its files top to bottom.
  auto sink = CreateOutputStream();
  auto rewriter = ParquetFileRewriter::Open(
      {{std::make_shared<BufferReader>(left_upper),
        std::make_shared<BufferReader>(left_lower)},
       {std::make_shared<BufferReader>(right_upper),
        std::make_shared<BufferReader>(right_lower)}},
      sink, {{NULLPTR, NULLPTR}, {NULLPTR, NULLPTR}}, NULLPTR, props);
  rewriter->Rewrite();
  rewriter->Close();

  // Read the rewritten file back into an Arrow table.
  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Buffer> result_buffer, sink->Finish());
  std::unique_ptr<FileReader> arrow_reader;
  ASSERT_OK_NO_THROW(FileReader::Make(
      ::arrow::default_memory_pool(),
      ParquetFileReader::Open(std::make_shared<BufferReader>(result_buffer)),
      &arrow_reader));

  std::shared_ptr<Table> actual;
  ASSERT_OK(arrow_reader->ReadTable(&actual));
  ASSERT_OK(actual->ValidateFull());

  // Expect all three columns, rows aligned across the two groups.
  auto merged_schema = ::arrow::schema({::arrow::field("a", ::arrow::int32()),
                                        ::arrow::field("b", ::arrow::utf8()),
                                        ::arrow::field("c", ::arrow::int64())});
  auto expected = ::arrow::TableFromJSON(
      merged_schema, {R"([[1, "a", 10], [2, "b", 20], [3, "c", 30]])"});
  AssertTablesEqual(*expected, *actual);
}

} // namespace parquet::arrow
28 changes: 28 additions & 0 deletions cpp/src/parquet/arrow/test_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,23 @@
#include "arrow/array/builder_binary.h"
#include "arrow/array/builder_decimal.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/table.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/decimal.h"
#include "arrow/util/float16.h"
#include "parquet/arrow/schema.h"
#include "parquet/arrow/writer.h"
#include "parquet/column_reader.h"
#include "parquet/file_writer.h"
#include "parquet/test_util.h"

namespace parquet {

using internal::RecordReader;
using schema::GroupNode;

namespace arrow {

Expand Down Expand Up @@ -482,6 +487,29 @@ void ExpectArrayT<::arrow::BooleanType>(void* expected, Array* result) {
EXPECT_TRUE(result->Equals(*expected_array));
}

/// \brief Serialize `table` to an in-memory Parquet file and return the bytes
/// through `buffer`.
///
/// \param writer_properties Parquet writer properties used for the file.
/// \param table The Arrow table to write; its schema is converted to a
///        Parquet schema with the default Arrow writer properties.
/// \param buffer Output parameter receiving the finished file contents.
///
/// Marked `inline` because this definition lives in a header included by
/// several test translation units (e.g. arrow_reader_writer_test.cc and
/// arrow_rewriter_test.cc); a non-inline definition would violate the ODR
/// and cause multiple-definition link errors.
///
/// Note: this helper uses ASSERT_* macros, which on failure return from this
/// function only — callers should check with ASSERT_NO_FATAL_FAILURE (or be
/// aware that `buffer` may be left unset on failure).
inline void WriteFile(const std::shared_ptr<WriterProperties>& writer_properties,
                      const std::shared_ptr<::arrow::Table>& table,
                      std::shared_ptr<Buffer>& buffer) {
  // Convert the Arrow schema to a Parquet schema.
  auto schema = table->schema();
  std::shared_ptr<SchemaDescriptor> parquet_schema;
  auto arrow_writer_properties = default_arrow_writer_properties();
  ASSERT_OK_NO_THROW(ToParquetSchema(schema.get(), *writer_properties,
                                     *arrow_writer_properties, &parquet_schema));
  auto schema_node = std::static_pointer_cast<GroupNode>(parquet_schema->schema_root());

  // Write the table into an in-memory output stream and hand back the buffer.
  auto sink = CreateOutputStream();
  auto pool = ::arrow::default_memory_pool();
  auto writer = ParquetFileWriter::Open(sink, schema_node, writer_properties);
  std::unique_ptr<FileWriter> arrow_writer;
  ASSERT_OK(FileWriter::Make(pool, std::move(writer), schema, arrow_writer_properties,
                             &arrow_writer));
  ASSERT_OK_NO_THROW(arrow_writer->WriteTable(*table));
  ASSERT_OK_NO_THROW(arrow_writer->Close());
  ASSERT_OK_AND_ASSIGN(buffer, sink->Finish());
}

} // namespace arrow

} // namespace parquet
Loading
Loading