Skip to content

Commit cd0a41f

Browse files
committed
Enable Multi Blocks in arrow::BinaryViewBuilder::AppendArraySlice
1 parent f1c6619 commit cd0a41f

File tree

2 files changed

+254
-34
lines changed

2 files changed

+254
-34
lines changed

cpp/src/arrow/array/array_test.cc

Lines changed: 224 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -976,28 +976,235 @@ TEST_F(TestArray, TestAppendArraySlice) {
976976
ASSERT_EQ(8, result->null_count());
977977
}
978978
}
979+
class TestBinaryViewBuilderAppendArraySlice : public TestArray {
980+
public:
981+
struct StringOptions {
982+
int64_t size;
983+
int32_t min_length;
984+
int32_t max_length;
985+
double null_probability;
986+
};
987+
TestBinaryViewBuilderAppendArraySlice() : TestArray(), generator_(12) {}
988+
void SetUp() override {
989+
TestArray::SetUp();
990+
builder_ = BinaryViewBuilder(pool_);
991+
}
992+
Result<std::shared_ptr<Array>> AppendArraySliceBySlice(
993+
const std::shared_ptr<Array>& array) {
994+
const ArraySpan span(*array->data());
995+
for (int i = 0; i < array->length(); i++) {
996+
ARROW_RETURN_NOT_OK(builder_.AppendArraySlice(span, i, 1));
997+
}
998+
return builder_.Finish();
999+
}
1000+
1001+
Result<std::shared_ptr<Array>> GenerateString(const StringOptions& option) {
1002+
return generator_
1003+
.StringView(option.size, option.min_length, option.max_length,
1004+
option.null_probability)
1005+
->View(binary_view());
1006+
}
1007+
1008+
protected:
1009+
BinaryViewBuilder builder_;
1010+
random::RandomArrayGenerator generator_;
1011+
};
1012+
1013+
TEST_F(TestBinaryViewBuilderAppendArraySlice, Inline) {
1014+
StringOptions options{};
1015+
options.size = 16;
1016+
options.min_length = 0;
1017+
options.max_length = 12;
1018+
options.null_probability = 0;
1019+
1020+
// Null values are not included
1021+
{
1022+
ASSERT_OK_AND_ASSIGN(auto src, GenerateString(options));
1023+
ArraySpan span(*src->data());
1024+
ASSERT_OK(builder_.AppendArraySlice(span, 0, src->length()));
1025+
ASSERT_OK_AND_ASSIGN(auto dst, builder_.Finish());
1026+
ASSERT_OK(dst->ValidateFull());
1027+
AssertArraysEqual(*dst, *src);
1028+
}
1029+
{
1030+
ASSERT_OK_AND_ASSIGN(auto src, GenerateString(options));
1031+
ASSERT_OK_AND_ASSIGN(auto dst, AppendArraySliceBySlice(src));
1032+
ASSERT_OK(dst->ValidateFull());
1033+
AssertArraysEqual(*dst, *src);
1034+
}
1035+
1036+
// Null values are included
1037+
options.null_probability = .2;
1038+
{
1039+
ASSERT_OK_AND_ASSIGN(auto src, GenerateString(options));
1040+
// Check for the existence of null values.
1041+
ASSERT_NE(src->data()->buffers[0], nullptr);
1042+
ArraySpan span(*src->data());
1043+
ASSERT_OK(builder_.AppendArraySlice(span, 0, src->length()));
1044+
ASSERT_OK_AND_ASSIGN(auto dst, builder_.Finish());
1045+
ASSERT_OK(dst->ValidateFull());
1046+
AssertArraysEqual(*dst, *src);
1047+
}
1048+
1049+
{
1050+
ASSERT_OK_AND_ASSIGN(auto src, GenerateString(options));
1051+
// Check for the existence of null values.
1052+
ASSERT_NE(src->data()->buffers[0], nullptr);
1053+
ASSERT_OK_AND_ASSIGN(auto dst, AppendArraySliceBySlice(src));
1054+
ASSERT_OK(dst->ValidateFull());
1055+
AssertArraysEqual(*dst, *src);
1056+
}
1057+
}
1058+
1059+
TEST_F(TestBinaryViewBuilderAppendArraySlice, NonInlineOneBLock) {
1060+
StringOptions options{};
1061+
options.size = 16;
1062+
options.min_length = 13;
1063+
options.max_length = 23;
1064+
options.null_probability = 0;
1065+
1066+
// Null values are not included
1067+
{
1068+
ASSERT_OK_AND_ASSIGN(auto src, GenerateString(options));
1069+
ArraySpan span(*src->data());
1070+
ASSERT_OK(builder_.AppendArraySlice(span, 0, src->length()));
1071+
ASSERT_OK_AND_ASSIGN(auto dst, builder_.Finish());
1072+
ASSERT_OK(dst->ValidateFull());
1073+
AssertArraysEqual(*dst, *src);
1074+
}
1075+
{
1076+
ASSERT_OK_AND_ASSIGN(auto src, GenerateString(options));
1077+
ASSERT_OK_AND_ASSIGN(auto dst, AppendArraySliceBySlice(src));
1078+
ASSERT_OK(dst->ValidateFull());
1079+
AssertArraysEqual(*dst, *src);
1080+
}
1081+
1082+
// Null values are included
1083+
options.null_probability = .2;
1084+
{
1085+
ASSERT_OK_AND_ASSIGN(auto src, GenerateString(options));
1086+
// Check for the existence of null values.
1087+
ASSERT_NE(src->data()->buffers[0], nullptr);
1088+
ArraySpan span(*src->data());
1089+
ASSERT_OK(builder_.AppendArraySlice(span, 0, src->length()));
1090+
ASSERT_OK_AND_ASSIGN(auto dst, builder_.Finish());
1091+
ASSERT_OK(dst->ValidateFull());
1092+
AssertArraysEqual(*dst, *src);
1093+
}
9791094

980-
// GH-39976: Test out-of-line data size calculation in
981-
// BinaryViewBuilder::AppendArraySlice.
982-
TEST_F(TestArray, TestBinaryViewAppendArraySlice) {
983-
BinaryViewBuilder src_builder(pool_);
984-
ASSERT_OK(src_builder.AppendNull());
985-
ASSERT_OK(src_builder.Append("long string; not inlined"));
986-
ASSERT_EQ(2, src_builder.length());
987-
ASSERT_OK_AND_ASSIGN(auto src, src_builder.Finish());
1095+
{
1096+
ASSERT_OK_AND_ASSIGN(auto src, GenerateString(options));
1097+
// Check for the existence of null values.
1098+
ASSERT_NE(src->data()->buffers[0], nullptr);
1099+
ASSERT_OK_AND_ASSIGN(auto dst, AppendArraySliceBySlice(src));
1100+
ASSERT_OK(dst->ValidateFull());
1101+
AssertArraysEqual(*dst, *src);
1102+
}
1103+
}
1104+
1105+
TEST_F(TestBinaryViewBuilderAppendArraySlice, InlineAndNonInlineOneBLock) {
1106+
StringOptions options{};
1107+
options.size = 40;
1108+
options.min_length = 1;
1109+
options.max_length = 23;
1110+
options.null_probability = 0;
1111+
// Null values are not included
1112+
{
1113+
ASSERT_OK_AND_ASSIGN(auto src, GenerateString(options));
1114+
ArraySpan span(*src->data());
1115+
ASSERT_OK(builder_.AppendArraySlice(span, 0, src->length()));
1116+
ASSERT_OK_AND_ASSIGN(auto dst, builder_.Finish());
1117+
ASSERT_OK(dst->ValidateFull());
1118+
AssertArraysEqual(*dst, *src);
1119+
}
1120+
{
1121+
ASSERT_OK_AND_ASSIGN(auto src, GenerateString(options));
1122+
ASSERT_OK_AND_ASSIGN(auto dst, AppendArraySliceBySlice(src));
1123+
ASSERT_OK(dst->ValidateFull());
1124+
AssertArraysEqual(*dst, *src);
1125+
}
1126+
1127+
options.null_probability = .2;
1128+
// Null values are included
1129+
{
1130+
ASSERT_OK_AND_ASSIGN(auto src, GenerateString(options));
1131+
// Check for the existence of null values.
1132+
ASSERT_NE(src->data()->buffers[0], nullptr);
1133+
ArraySpan span(*src->data());
1134+
ASSERT_OK(builder_.AppendArraySlice(span, 0, src->length()));
1135+
ASSERT_OK_AND_ASSIGN(auto dst, builder_.Finish());
1136+
ASSERT_OK(dst->ValidateFull());
1137+
AssertArraysEqual(*dst, *src);
1138+
}
1139+
1140+
{
1141+
ASSERT_OK_AND_ASSIGN(auto src, GenerateString(options));
1142+
// Check for the existence of null values.
1143+
ASSERT_NE(src->data()->buffers[0], nullptr);
1144+
ASSERT_OK_AND_ASSIGN(auto dst, AppendArraySliceBySlice(src));
1145+
ASSERT_OK(dst->ValidateFull());
1146+
AssertArraysEqual(*dst, *src);
1147+
}
1148+
}
1149+
1150+
TEST_F(TestBinaryViewBuilderAppendArraySlice,
1151+
LARGE_MEMORY_TEST(NonInlineMultipleBlocks)) {
1152+
std::string a(1 << 30, 'a');
1153+
ASSERT_OK(builder_.Append(a));
1154+
ASSERT_OK(builder_.AppendNull());
1155+
ASSERT_OK(builder_.Append(a));
1156+
ASSERT_OK(builder_.AppendNull());
1157+
ASSERT_OK(builder_.Append(a));
1158+
ASSERT_OK_AND_ASSIGN(auto src, builder_.Finish())
9881159
ASSERT_OK(src->ValidateFull());
9891160

9901161
ArraySpan span;
9911162
span.SetMembers(*src->data());
992-
BinaryViewBuilder dst_builder(pool_);
993-
ASSERT_OK(dst_builder.AppendArraySlice(span, 0, 1));
994-
ASSERT_EQ(1, dst_builder.length());
995-
ASSERT_OK(dst_builder.AppendArraySlice(span, 1, 1));
996-
ASSERT_EQ(2, dst_builder.length());
997-
ASSERT_OK_AND_ASSIGN(auto dst, dst_builder.Finish());
998-
ASSERT_OK(dst->ValidateFull());
999-
1000-
AssertArraysEqual(*src, *dst);
1163+
{
1164+
ASSERT_OK(builder_.AppendArraySlice(span, 0, src->length()));
1165+
ASSERT_OK_AND_ASSIGN(auto dst, builder_.Finish());
1166+
ASSERT_OK(dst->ValidateFull());
1167+
AssertArraysEqual(*src, *dst);
1168+
}
1169+
1170+
{
1171+
ASSERT_OK_AND_ASSIGN(auto dst, AppendArraySliceBySlice(src));
1172+
ASSERT_OK(dst->ValidateFull());
1173+
AssertArraysEqual(*src, *dst);
1174+
}
1175+
}
1176+
1177+
TEST_F(TestBinaryViewBuilderAppendArraySlice,
1178+
LARGE_MEMORY_TEST(InlineAndNonInlineMultipleBlocks)) {
1179+
std::string large_a(1 << 30, 'a');
1180+
std::string small_a(1 << 14, 'a');
1181+
ASSERT_OK(builder_.Append(large_a));
1182+
ASSERT_EQ(builder_.current_block_bytes_remaining(), 0);
1183+
ASSERT_OK(builder_.AppendNull());
1184+
ASSERT_OK(builder_.Append(small_a));
1185+
ASSERT_OK(builder_.AppendNull());
1186+
ASSERT_OK(builder_.Append("123456"));
1187+
ASSERT_EQ(builder_.current_block_bytes_remaining(), 1 << 14);
1188+
ASSERT_OK(builder_.Append(large_a));
1189+
ASSERT_OK(builder_.AppendNull());
1190+
ASSERT_OK(builder_.Append(large_a));
1191+
ASSERT_OK_AND_ASSIGN(auto src, builder_.Finish());
1192+
ASSERT_OK(src->ValidateFull());
1193+
1194+
ArraySpan span;
1195+
span.SetMembers(*src->data());
1196+
{
1197+
ASSERT_OK(builder_.AppendArraySlice(span, 0, src->length()));
1198+
ASSERT_OK_AND_ASSIGN(auto dst, builder_.Finish());
1199+
ASSERT_OK(dst->ValidateFull());
1200+
AssertArraysEqual(*src, *dst);
1201+
}
1202+
1203+
{
1204+
ASSERT_OK_AND_ASSIGN(auto dst, AppendArraySliceBySlice(src));
1205+
ASSERT_OK(dst->ValidateFull());
1206+
AssertArraysEqual(*src, *dst);
1207+
}
10011208
}
10021209

10031210
TEST_F(TestArray, ValidateBuffersPrimitive) {

cpp/src/arrow/array/builder_binary.cc

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -50,30 +50,43 @@ BinaryViewBuilder::BinaryViewBuilder(const std::shared_ptr<DataType>& type,
5050
Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t offset,
5151
int64_t length) {
5252
auto bitmap = array.GetValues<uint8_t>(0, 0);
53+
RETURN_NOT_OK(Reserve(length));
5354
auto values = array.GetValues<BinaryViewType::c_type>(1) + offset;
54-
55-
int64_t out_of_line_total = 0, i = 0;
56-
VisitNullBitmapInline(
55+
auto AddString = [&](int64_t start, int64_t end, int64_t size) {
56+
RETURN_NOT_OK(ReserveData(size));
57+
for (int64_t i = start; i < end; i++) {
58+
if (bitmap && !bit_util::GetBit(bitmap, array.offset + i + offset)) {
59+
UnsafeAppendNull();
60+
continue;
61+
}
62+
63+
UnsafeAppend(util::FromBinaryView(values[i], array.GetVariadicBuffers().data()));
64+
}
65+
return Status::OK();
66+
};
67+
int64_t out_of_line_total = 0, i = 0, start = 0;
68+
ARROW_RETURN_NOT_OK(VisitNullBitmapInline(
5769
array.buffers[0].data, array.offset + offset, length, array.null_count,
5870
[&] {
5971
if (!values[i].is_inline()) {
60-
out_of_line_total += static_cast<int64_t>(values[i].size());
72+
auto current_length_value = static_cast<int64_t>(values[i].size());
73+
if (out_of_line_total + current_length_value >
74+
data_heap_builder_.ValueSizeLimit()) {
75+
ARROW_RETURN_NOT_OK(AddString(start, i, out_of_line_total));
76+
out_of_line_total = current_length_value;
77+
start = i;
78+
} else {
79+
out_of_line_total += current_length_value;
80+
}
6181
}
6282
++i;
83+
return Status::OK();
6384
},
64-
[&] { ++i; });
65-
66-
RETURN_NOT_OK(Reserve(length));
67-
RETURN_NOT_OK(ReserveData(out_of_line_total));
68-
69-
for (int64_t i = 0; i < length; i++) {
70-
if (bitmap && !bit_util::GetBit(bitmap, array.offset + offset + i)) {
71-
UnsafeAppendNull();
72-
continue;
73-
}
74-
75-
UnsafeAppend(util::FromBinaryView(values[i], array.GetVariadicBuffers().data()));
76-
}
85+
[&] {
86+
++i;
87+
return Status::OK();
88+
}));
89+
ARROW_RETURN_NOT_OK(AddString(start, i, out_of_line_total));
7790
return Status::OK();
7891
}
7992

0 commit comments

Comments
 (0)