Skip to content

Commit 8b3bbb0

Browse files
authored
Merge branch 'main' into zhf-refractor
2 parents d9d2f4e + 0dbbda6 commit 8b3bbb0

31 files changed

Lines changed: 267 additions & 5 deletions

test/inte/blob_table_inte_test.cpp

Lines changed: 85 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@
1616

1717
#include <cstdint>
1818
#include <cstdlib>
19-
#include <filesystem>
20-
#include <fstream>
19+
#include <initializer_list>
2120
#include <map>
2221
#include <memory>
2322
#include <numeric>
23+
#include <optional>
2424
#include <set>
2525
#include <string>
2626
#include <utility>
@@ -40,6 +40,7 @@
4040
#include "paimon/common/data/binary_array_writer.h"
4141
#include "paimon/common/data/binary_row.h"
4242
#include "paimon/common/data/binary_row_writer.h"
43+
#include "paimon/common/data/blob_descriptor.h"
4344
#include "paimon/common/data/blob_view_struct.h"
4445
#include "paimon/common/factories/io_hook.h"
4546
#include "paimon/common/table/special_fields.h"
@@ -345,19 +346,58 @@ class BlobTableInteTest : public testing::Test, public ::testing::WithParamInter
345346
});
346347
}
347348

349+
/// Describes how blob descriptor URIs are redirected onto a local copy of a table.
///
/// A default-constructed value (empty `table_path`) disables rewriting.
struct BlobDescriptorPathRewrite {
    /// Path the rewritten, table-relative blob path is joined onto.
    std::string table_path;
    /// Directory names looked up as "/<dir>/" inside a descriptor URI, tried in order.
    std::vector<std::string> table_relative_blob_dirs;
};
353+
354+
static std::optional<std::string> TryRewriteDescriptorUri(
355+
const std::string& descriptor_uri, const BlobDescriptorPathRewrite& rewrite,
356+
const std::shared_ptr<LocalFileSystem>& fs) {
357+
if (rewrite.table_path.empty()) {
358+
return std::nullopt;
359+
}
360+
361+
for (const auto& blob_dir : rewrite.table_relative_blob_dirs) {
362+
const std::string marker = "/" + blob_dir + "/";
363+
auto marker_pos = descriptor_uri.find(marker);
364+
if (marker_pos != std::string::npos) {
365+
std::string relative_blob_path = descriptor_uri.substr(marker_pos + 1);
366+
return PathUtil::JoinPath(rewrite.table_path, relative_blob_path);
367+
}
368+
}
369+
return std::nullopt;
370+
}
371+
348372
/// Convert a StructArray with serialized BlobDescriptor bytes back to a StructArray
349373
/// with raw blob bytes. Only blob fields are resolved; other columns (including
350374
/// _VALUE_KIND) are kept as-is.
351375
Result<std::shared_ptr<arrow::StructArray>> ConvertDescriptorToRawBlob(
352376
const std::shared_ptr<arrow::StructArray>& desc_array,
353-
const std::set<std::string>& blob_fields) const {
377+
const std::set<std::string>& blob_fields,
378+
const BlobDescriptorPathRewrite& rewrite = {}) const {
354379
auto fs = std::make_shared<LocalFileSystem>();
355380
return TransformBlobFields(
356381
desc_array, blob_fields,
357382
[&](const std::string_view& descriptor_bytes,
358383
arrow::LargeBinaryBuilder* builder) -> Status {
359-
PAIMON_ASSIGN_OR_RAISE(auto blob, Blob::FromDescriptor(descriptor_bytes.data(),
360-
descriptor_bytes.size()));
384+
PAIMON_ASSIGN_OR_RAISE(
385+
auto descriptor,
386+
BlobDescriptor::Deserialize(descriptor_bytes.data(), descriptor_bytes.size()));
387+
std::string descriptor_uri = descriptor->Uri();
388+
auto rewritten_uri = TryRewriteDescriptorUri(descriptor_uri, rewrite, fs);
389+
if (rewritten_uri.has_value()) {
390+
descriptor_uri = rewritten_uri.value();
391+
}
392+
393+
PAIMON_ASSIGN_OR_RAISE(
394+
auto rewritten_descriptor,
395+
BlobDescriptor::Create(descriptor->Version(), descriptor_uri,
396+
descriptor->Offset(), descriptor->Length()));
397+
auto rewritten_descriptor_bytes = rewritten_descriptor->Serialize(pool_);
398+
PAIMON_ASSIGN_OR_RAISE(auto blob,
399+
Blob::FromDescriptor(rewritten_descriptor_bytes->data(),
400+
rewritten_descriptor_bytes->size()));
361401
PAIMON_ASSIGN_OR_RAISE(auto data, blob->ToData(fs, pool_));
362402
PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(data->data(), data->size()));
363403
return Status::OK();
@@ -3008,4 +3048,44 @@ TEST_P(BlobTableInteTest, TestBlobViewWithFallbackPath) {
30083048
<< "expected:" << expected_with_rk->ToString();
30093049
}
30103050

3051+
/// Reads the `blob_desc_field_with_external_path` fixture table and checks that all blob
/// columns resolve to the expected raw bytes once the descriptor URIs of b0/b1 have been
/// redirected into the local table directory.
TEST_P(BlobTableInteTest, TestReadBlobDescriptorFieldFromJava) {
    auto file_format = GetParam();
    // Only the orc and parquet parameterizations are exercised; others return early.
    if (file_format != "orc" && file_format != "parquet") {
        return;
    }
    std::string table_path =
        GetDataDir() + "/" + file_format +
        "/blob_desc_field_with_external_path.db/blob_desc_field_with_external_path";
    arrow::FieldVector fields = {
        arrow::field("f0", arrow::int32()), BlobUtils::ToArrowField("b0", true),
        BlobUtils::ToArrowField("b1", true), BlobUtils::ToArrowField("b2", true),
        BlobUtils::ToArrowField("b3", true)};
    auto schema = arrow::schema(fields);
    // b0: all non-null, b1: has nulls, b2: all non-null, b3: has nulls
    std::string raw_json = R"([
[1, "img_0", null, "raw_2_0", "raw_3_0"],
[2, "img_1", "vid_1", "raw_2_1", null ],
[3, "img_2", null, "raw_2_2", "raw_3_2" ]
])";
    auto raw_array = std::dynamic_pointer_cast<arrow::StructArray>(
        arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), raw_json).ValueOrDie());
    // Fail the test instead of dereferencing a null pointer if the cast does not apply.
    ASSERT_TRUE(raw_array);

    ASSERT_OK_AND_ASSIGN(auto plan, ScanTable(table_path));
    std::map<std::string, std::string> read_options = {{Options::BLOB_AS_DESCRIPTOR, "false"}};
    ASSERT_OK_AND_ASSIGN(auto result, ReadTable(table_path, schema->field_names(), plan,
                                                /*predicate=*/nullptr, read_options));
    ASSERT_TRUE(result.chunked_array);
    auto read_concat = arrow::Concatenate(result.chunked_array->chunks()).ValueOrDie();
    auto read_struct = std::dynamic_pointer_cast<arrow::StructArray>(read_concat);
    ASSERT_TRUE(read_struct);

    // After read, b0 and b1 are both descriptor-stored; resolve all back to raw bytes.
    // Java-generated descriptors may contain absolute paths from the generation machine.
    // Rewrite them to the portable blob directories inside the copied table path.
    BlobDescriptorPathRewrite rewrite{table_path, {"raw_blob", "external_blob"}};
    ASSERT_OK_AND_ASSIGN(auto resolved,
                         ConvertDescriptorToRawBlob(read_struct, {"b0", "b1"}, rewrite));
    ASSERT_OK_AND_ASSIGN(auto expected_with_rk, PrependRowKindColumn(raw_array));
    // Print both sides on mismatch so a failure can be diagnosed from the log alone.
    ASSERT_TRUE(resolved->Equals(expected_with_rk))
        << "result:" << resolved->ToString() << "\n"
        << "expected:" << expected_with_rk->ToString();
}
3090+
30113091
} // namespace paimon::test
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
f0:int b0:blob b1:blob b2:blob b3:blob (all can be null)
2+
bucket count: -1
3+
target-file-size: 700
4+
row-tracking.enabled: true
5+
data-evolution.enabled: true
6+
blob-descriptor-field: b0,b1
7+
blob-external-storage-field: b1
8+
blob-external-storage-path: <table>/external_blob (absolute path at generation time)
9+
10+
b0: descriptor field, inline in main file, source .bin files in raw_blob/
11+
b1: descriptor field, repacked to external storage in external_blob/
12+
b2: regular blob, written to .blob files
13+
b3: regular blob, written to .blob files
14+
15+
Note: b0 is passed as descriptor via Blob.fromLocal(); b1/b2/b3 are raw bytes.
16+
Paimon auto-converts b1 to descriptor internally.
17+
18+
Msgs:
19+
snapshot-1
20+
write field: "f0", "b0", "b1", "b2", "b3"
21+
Add: 1, "img_0", null, "raw_2_0", "raw_3_0"
22+
Add: 2, "img_1", "vid_1", "raw_2_1", null
23+
Add: 3, "img_2", null, "raw_2_2", "raw_3_2"
24+
NoCompact
25+
26+
C++ read note:
27+
Descriptor URIs contain absolute paths from the Java generation machine.
28+
ConvertDescriptorToRawBlob uses BlobDescriptorPathRewrite{<table>, {"raw_blob", "external_blob"}}
29+
to redirect them to <table>/raw_blob/ and <table>/external_blob/ at read time.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
img_0

0 commit comments

Comments
 (0)