Skip to content

Commit bb7fca9

Browse files
committed
fix: make byte hashing independent of char signedness
- Treat Hive byte and byte-array hashing as signed Java bytes
- Make Murmur tail-byte hashing use explicit int8_t values
1 parent 7b74e1c commit bb7fca9

3 files changed

Lines changed: 15 additions & 8 deletions

File tree

src/paimon/common/utils/murmurhash_utils.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,9 @@ class MurmurHashUtils {
182182
int32_t length_aligned = length_in_bytes - length_in_bytes % 4;
183183
int32_t h1 = HashBytesByInt(segment, offset, length_aligned, seed);
184184
for (int32_t i = length_aligned; i < length_in_bytes; i++) {
185-
int32_t k1 = MixK1(segment.Get(offset + i));
185+
auto byte = static_cast<uint8_t>(segment.Get(offset + i));
186+
int32_t signed_byte = byte < 128 ? byte : static_cast<int32_t>(byte) - 256;
187+
int32_t k1 = MixK1(signed_byte);
186188
h1 = MixH1(h1, k1);
187189
}
188190
return Fmix(h1, length_in_bytes);
@@ -238,10 +240,10 @@ class MurmurHashUtils {
238240
return value;
239241
}
240242

241-
static char GetByte(const void* base, int64_t offset) {
242-
char value;
243-
std::memcpy(&value, static_cast<const char*>(base) + offset, sizeof(char));
244-
return value;
243+
static int32_t GetByte(const void* base, int64_t offset) {
244+
uint8_t value;
245+
std::memcpy(&value, static_cast<const char*>(base) + offset, sizeof(uint8_t));
246+
return value < 128 ? value : static_cast<int32_t>(value) - 256;
245247
}
246248

247249
public:

src/paimon/core/bucket/hive_bucket_function.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,11 @@ uint32_t HiveBucketFunction::ComputeHash(const BinaryRow& row, int32_t field_ind
8686
switch (info.type) {
8787
case FieldType::BOOLEAN:
8888
return HiveHasher::HashInt(row.GetBoolean(field_index) ? 1 : 0);
89-
case FieldType::TINYINT:
90-
return HiveHasher::HashInt(static_cast<uint32_t>(row.GetByte(field_index)));
89+
case FieldType::TINYINT: {
90+
uint8_t byte = static_cast<uint8_t>(row.GetByte(field_index));
91+
int32_t signed_byte = byte < 128 ? byte : static_cast<int32_t>(byte) - 256;
92+
return HiveHasher::HashInt(static_cast<uint32_t>(signed_byte));
93+
}
9194
case FieldType::SMALLINT:
9295
return HiveHasher::HashInt(static_cast<uint32_t>(row.GetShort(field_index)));
9396
case FieldType::INT:

src/paimon/core/bucket/hive_hasher.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@ class HiveHasher {
4242
static uint32_t HashBytes(const char* bytes, int32_t length) {
4343
uint32_t result = 0;
4444
for (int32_t i = 0; i < length; i++) {
45-
result = result * 31U + static_cast<uint32_t>(static_cast<int32_t>(bytes[i]));
45+
uint8_t byte = static_cast<uint8_t>(bytes[i]);
46+
int32_t signed_byte = byte < 128 ? byte : static_cast<int32_t>(byte) - 256;
47+
result = result * 31U + static_cast<uint32_t>(signed_byte);
4648
}
4749
return result;
4850
}

0 commit comments

Comments
 (0)