|
40 | 40 |
|
41 | 41 | public class TypeUtil {
|
42 | 42 |
|
| 43 | + private static final int HEADER_SIZE = 12; |
| 44 | + |
private TypeUtil() {
  // intentionally empty: private so that code outside this class cannot instantiate it
}
|
44 | 46 |
|
45 | 47 | /**
|
@@ -452,6 +454,68 @@ private static void checkSchemaCompatibility(
|
452 | 454 | }
|
453 | 455 | }
|
454 | 456 |
|
| 457 | + /** |
| 458 | + * Estimates the number of bytes a value for a given field may occupy in memory. |
| 459 | + * |
| 460 | + * <p>This method approximates the memory size based on heuristics and the internal Java |
| 461 | + * representation defined by {@link Type.TypeID}. It is important to note that the actual size |
| 462 | + * might differ from this estimation. The method is designed to handle a variety of data types, |
| 463 | + * including primitive types, strings, and nested types such as structs, maps, and lists. |
| 464 | + * |
| 465 | + * @param field a field for which to estimate the size |
| 466 | + * @return the estimated size in bytes of the field's value in memory |
| 467 | + */ |
| 468 | + public static int estimateSize(Types.NestedField field) { |
| 469 | + return estimateSize(field.type()); |
| 470 | + } |
| 471 | + |
| 472 | + private static int estimateSize(Type type) { |
| 473 | + switch (type.typeId()) { |
| 474 | + case BOOLEAN: |
| 475 | + // the size of a boolean variable is virtual machine dependent |
| 476 | + // it is common to believe booleans occupy 1 byte in most JVMs |
| 477 | + return 1; |
| 478 | + case INTEGER: |
| 479 | + case FLOAT: |
| 480 | + case DATE: |
| 481 | + // ints and floats occupy 4 bytes |
| 482 | + // dates are internally represented as ints |
| 483 | + return 4; |
| 484 | + case LONG: |
| 485 | + case DOUBLE: |
| 486 | + case TIME: |
| 487 | + case TIMESTAMP: |
| 488 | + // longs and doubles occupy 8 bytes |
| 489 | + // times and timestamps are internally represented as longs |
| 490 | + return 8; |
| 491 | + case STRING: |
| 492 | + // 12 (header) + 6 (fields) + 16 (array overhead) + 20 (10 chars, 2 bytes each) = 54 bytes |
| 493 | + return 54; |
| 494 | + case UUID: |
| 495 | + // 12 (header) + 16 (two long variables) = 28 bytes |
| 496 | + return 28; |
| 497 | + case FIXED: |
| 498 | + return ((Types.FixedType) type).length(); |
| 499 | + case BINARY: |
| 500 | + return 80; |
| 501 | + case DECIMAL: |
| 502 | + // 12 (header) + (12 + 12 + 4) (BigInteger) + 4 (scale) = 44 bytes |
| 503 | + return 44; |
| 504 | + case STRUCT: |
| 505 | + Types.StructType struct = (Types.StructType) type; |
| 506 | + return HEADER_SIZE + struct.fields().stream().mapToInt(TypeUtil::estimateSize).sum(); |
| 507 | + case LIST: |
| 508 | + Types.ListType list = (Types.ListType) type; |
| 509 | + return HEADER_SIZE + 5 * estimateSize(list.elementType()); |
| 510 | + case MAP: |
| 511 | + Types.MapType map = (Types.MapType) type; |
| 512 | + int entrySize = HEADER_SIZE + estimateSize(map.keyType()) + estimateSize(map.valueType()); |
| 513 | + return HEADER_SIZE + 5 * entrySize; |
| 514 | + default: |
| 515 | + return 16; |
| 516 | + } |
| 517 | + } |
| 518 | + |
455 | 519 | /** Interface for passing a function that assigns column IDs. */
|
456 | 520 | public interface NextID {
|
457 | 521 | int get();
|
|
0 commit comments