diff --git a/flink-cdc-common-2.x/pom.xml b/flink-cdc-common-2.x/pom.xml new file mode 100644 index 00000000000..45743a5c430 --- /dev/null +++ b/flink-cdc-common-2.x/pom.xml @@ -0,0 +1,59 @@ + + + + + flink-cdc-parent + org.apache.flink + ${revision} + + 4.0.0 + + flink-cdc-common-2.x + + + ${flink.2.x.version} + ${flink.2.x.shaded.guava.version} + + + + + org.apache.flink + flink-cdc-common + ${project.version} + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + test-jar + + test-jar + + + + + + + \ No newline at end of file diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/configuration/description/TextElement.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/configuration/description/TextElement.java new file mode 100644 index 00000000000..a9bd5e845e0 --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/configuration/description/TextElement.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.cdc.common.configuration.description; + +import org.apache.flink.cdc.common.annotation.PublicEvolving; + +import org.apache.flink.shaded.guava33.com.google.common.base.Strings; + +import java.util.Arrays; +import java.util.Collections; +import java.util.EnumSet; +import java.util.List; + +/** Represents a text block in the {@link Description}. */ +@PublicEvolving +public class TextElement implements BlockElement, InlineElement { + private final String format; + private final List elements; + private final EnumSet textStyles = EnumSet.noneOf(TextStyle.class); + + /** + * Creates a block of text with placeholders ("%s") that will be replaced with proper string + * representation of given {@link InlineElement}. For example: + * + *

{@code text("This is a text with a link %s", link("https://somepage", "to here"))} + * + * @param format text with placeholders for elements + * @param elements elements to be put in the text + * @return block of text + */ + public static TextElement text(String format, InlineElement... elements) { + return new TextElement(format, Arrays.asList(elements)); + } + + /** + * Creates a simple block of text. + * + * @param text a simple block of text + * @return block of text + */ + public static TextElement text(String text) { + return new TextElement(text, Collections.emptyList()); + } + + /** Wraps a list of {@link InlineElement}s into a single {@link TextElement}. */ + public static InlineElement wrap(InlineElement... elements) { + return text(Strings.repeat("%s", elements.length), elements); + } + + /** + * Creates a block of text formatted as code. + * + * @param text a block of text that will be formatted as code + * @return block of text formatted as code + */ + public static TextElement code(String text) { + TextElement element = text(text); + element.textStyles.add(TextStyle.CODE); + return element; + } + + public String getFormat() { + return format; + } + + public List getElements() { + return elements; + } + + public EnumSet getStyles() { + return textStyles; + } + + private TextElement(String format, List elements) { + this.format = format; + this.elements = elements; + } + + @Override + public void format(Formatter formatter) { + formatter.format(this); + } + + /** Styles that can be applied to {@link TextElement} e.g. code, bold etc. 
*/ + @PublicEvolving + public enum TextStyle { + CODE + } +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinaryArrayData.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinaryArrayData.java new file mode 100644 index 00000000000..991085ae226 --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinaryArrayData.java @@ -0,0 +1,623 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.cdc.common.data.binary; + +import org.apache.flink.cdc.common.data.ArrayData; +import org.apache.flink.cdc.common.data.DecimalData; +import org.apache.flink.cdc.common.data.LocalZonedTimestampData; +import org.apache.flink.cdc.common.data.MapData; +import org.apache.flink.cdc.common.data.RecordData; +import org.apache.flink.cdc.common.data.StringData; +import org.apache.flink.cdc.common.data.TimestampData; +import org.apache.flink.cdc.common.data.ZonedTimestampData; +import org.apache.flink.cdc.common.types.DataType; +import org.apache.flink.cdc.common.types.utils.DataTypeUtils; +import org.apache.flink.core.memory.MemorySegment; +import org.apache.flink.core.memory.MemorySegmentFactory; + +import java.lang.reflect.Array; + +import static org.apache.flink.core.memory.MemoryUtils.UNSAFE; + +/** + * A binary implementation of {@link ArrayData} which is backed by {@link MemorySegment}s. + * + *

This class provides a way to store array data in a binary format that is compact and + * efficient. It uses {@link MemorySegment}s to manage the binary representation of the data, + * allowing for efficient storage and access. + * + *

The binary layout of {@link BinaryArrayData} is structured as follows: + * + *

+ * [size(int)] + [null bits(4-byte word boundaries)] + [values or offset&length] + [variable length part].
+ * 
+ * + * + * + *

The header size is calculated based on the number of elements in the array, ensuring efficient + * alignment and access. + * + *

For fields that hold fixed-length primitive types, such as long, double, or int, they are + * stored compactly in bytes, just like the original Java array. + * + *

The class also provides methods to convert the binary data back into Java primitive arrays, + * handling various types such as boolean, byte, short, int, long, float, and double. + */ +public final class BinaryArrayData extends BinarySection implements ArrayData { + + /** Offset for Arrays. */ + private static final int BYTE_ARRAY_BASE_OFFSET = UNSAFE.arrayBaseOffset(byte[].class); + + private static final int BOOLEAN_ARRAY_OFFSET = UNSAFE.arrayBaseOffset(boolean[].class); + private static final int SHORT_ARRAY_OFFSET = UNSAFE.arrayBaseOffset(short[].class); + private static final int INT_ARRAY_OFFSET = UNSAFE.arrayBaseOffset(int[].class); + private static final int LONG_ARRAY_OFFSET = UNSAFE.arrayBaseOffset(long[].class); + private static final int FLOAT_ARRAY_OFFSET = UNSAFE.arrayBaseOffset(float[].class); + private static final int DOUBLE_ARRAY_OFFSET = UNSAFE.arrayBaseOffset(double[].class); + + /** + * Calculates the size of the header in bytes for an array with the specified number of fields. + * + *

The header consists of: + * + *

+ * + *

The size of the bitmap is determined by the number of elements in the array: + * + *

+ * + *

The formula for calculating the size of the header is: + * + *

+     *   header size = 4 bytes (for array size) + ((numFields + 31) / 32) * 4 bytes (for null bitmap)
+     * 
+ * + * @param numFields the number of elements in the array + * @return the size of the header in bytes + */ + public static int calculateHeaderInBytes(int numFields) { + return 4 + ((numFields + 31) / 32) * 4; + } + + /** + * It store real value when type is primitive. It store the length and offset of variable-length + * part when type is string, map, etc. + */ + public static int calculateFixLengthPartSize(DataType type) { + // ordered by type root definition + switch (type.getTypeRoot()) { + case BOOLEAN: + case TINYINT: + return 1; + case CHAR: + case VARCHAR: + case BINARY: + case VARBINARY: + case DECIMAL: + case BIGINT: + case DOUBLE: + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + case ARRAY: + case MAP: + case ROW: + // long and double are 8 bytes; + // otherwise it stores the length and offset of the variable-length part for types + // such as is string, map, etc. + return 8; + case TIMESTAMP_WITH_TIME_ZONE: + throw new UnsupportedOperationException(); + case SMALLINT: + return 2; + case INTEGER: + case FLOAT: + case DATE: + case TIME_WITHOUT_TIME_ZONE: + return 4; + default: + throw new IllegalArgumentException(); + } + } + + // The number of elements in this array + private int size; + + /** The position to start storing array elements. */ + private int elementOffset; + + public BinaryArrayData() {} + + private void assertIndexIsValid(int index) { + assert index >= 0 : "index (" + index + ") should >= 0"; + assert index < size : "index (" + index + ") should < " + size; + } + + private int getElementOffset(int ordinal, int elementSize) { + return elementOffset + ordinal * elementSize; + } + + @Override + public int size() { + return size; + } + + @Override + public void pointTo(MemorySegment[] segments, int offset, int sizeInBytes) { + // Read the number of elements from the first 4 bytes. 
+ final int size = BinarySegmentUtils.getInt(segments, offset); + assert size >= 0 : "size (" + size + ") should >= 0"; + + this.size = size; + super.pointTo(segments, offset, sizeInBytes); + this.elementOffset = offset + calculateHeaderInBytes(this.size); + } + + @Override + public boolean isNullAt(int pos) { + assertIndexIsValid(pos); + return BinarySegmentUtils.bitGet(segments, offset + 4, pos); + } + + public void setNullAt(int pos) { + assertIndexIsValid(pos); + BinarySegmentUtils.bitSet(segments, offset + 4, pos); + } + + public void setNotNullAt(int pos) { + assertIndexIsValid(pos); + BinarySegmentUtils.bitUnSet(segments, offset + 4, pos); + } + + @Override + public long getLong(int pos) { + assertIndexIsValid(pos); + return BinarySegmentUtils.getLong(segments, getElementOffset(pos, 8)); + } + + public void setLong(int pos, long value) { + assertIndexIsValid(pos); + setNotNullAt(pos); + BinarySegmentUtils.setLong(segments, getElementOffset(pos, 8), value); + } + + public void setNullLong(int pos) { + assertIndexIsValid(pos); + BinarySegmentUtils.bitSet(segments, offset + 4, pos); + BinarySegmentUtils.setLong(segments, getElementOffset(pos, 8), 0L); + } + + @Override + public int getInt(int pos) { + assertIndexIsValid(pos); + return BinarySegmentUtils.getInt(segments, getElementOffset(pos, 4)); + } + + public void setInt(int pos, int value) { + assertIndexIsValid(pos); + setNotNullAt(pos); + BinarySegmentUtils.setInt(segments, getElementOffset(pos, 4), value); + } + + public void setNullInt(int pos) { + assertIndexIsValid(pos); + BinarySegmentUtils.bitSet(segments, offset + 4, pos); + BinarySegmentUtils.setInt(segments, getElementOffset(pos, 4), 0); + } + + @Override + public StringData getString(int pos) { + assertIndexIsValid(pos); + int fieldOffset = getElementOffset(pos, 8); + final long offsetAndSize = BinarySegmentUtils.getLong(segments, fieldOffset); + return BinarySegmentUtils.readStringData(segments, offset, fieldOffset, offsetAndSize); + } + + 
@Override + public DecimalData getDecimal(int pos, int precision, int scale) { + assertIndexIsValid(pos); + if (DecimalData.isCompact(precision)) { + return DecimalData.fromUnscaledLong( + BinarySegmentUtils.getLong(segments, getElementOffset(pos, 8)), + precision, + scale); + } + + int fieldOffset = getElementOffset(pos, 8); + final long offsetAndSize = BinarySegmentUtils.getLong(segments, fieldOffset); + return BinarySegmentUtils.readDecimalData( + segments, offset, offsetAndSize, precision, scale); + } + + @Override + public TimestampData getTimestamp(int pos, int precision) { + assertIndexIsValid(pos); + + if (TimestampData.isCompact(precision)) { + return TimestampData.fromMillis( + BinarySegmentUtils.getLong(segments, getElementOffset(pos, 8))); + } + + int fieldOffset = getElementOffset(pos, 8); + final long offsetAndNanoOfMilli = BinarySegmentUtils.getLong(segments, fieldOffset); + return BinarySegmentUtils.readTimestampData(segments, offset, offsetAndNanoOfMilli); + } + + @Override + public LocalZonedTimestampData getLocalZonedTimestamp(int pos, int precision) { + throw new UnsupportedOperationException("Not support LocalZonedTimestampData"); + } + + @Override + public ZonedTimestampData getZonedTimestamp(int pos, int precision) { + throw new UnsupportedOperationException("Not support ZonedTimestampData"); + } + + @Override + public byte[] getBinary(int pos) { + assertIndexIsValid(pos); + int fieldOffset = getElementOffset(pos, 8); + final long offsetAndSize = BinarySegmentUtils.getLong(segments, fieldOffset); + return BinarySegmentUtils.readBinary(segments, offset, fieldOffset, offsetAndSize); + } + + @Override + public ArrayData getArray(int pos) { + assertIndexIsValid(pos); + return BinarySegmentUtils.readArrayData(segments, offset, getLong(pos)); + } + + @Override + public MapData getMap(int pos) { + assertIndexIsValid(pos); + return BinarySegmentUtils.readMapData(segments, offset, getLong(pos)); + } + + @Override + public RecordData getRecord(int pos, 
int numFields) { + assertIndexIsValid(pos); + int fieldOffset = getElementOffset(pos, 8); + final long offsetAndSize = BinarySegmentUtils.getLong(segments, fieldOffset); + return BinarySegmentUtils.readRecordData(segments, numFields, offset, offsetAndSize); + } + + @Override + public boolean getBoolean(int pos) { + assertIndexIsValid(pos); + return BinarySegmentUtils.getBoolean(segments, getElementOffset(pos, 1)); + } + + public void setBoolean(int pos, boolean value) { + assertIndexIsValid(pos); + setNotNullAt(pos); + BinarySegmentUtils.setBoolean(segments, getElementOffset(pos, 1), value); + } + + public void setNullBoolean(int pos) { + assertIndexIsValid(pos); + BinarySegmentUtils.bitSet(segments, offset + 4, pos); + BinarySegmentUtils.setBoolean(segments, getElementOffset(pos, 1), false); + } + + @Override + public byte getByte(int pos) { + assertIndexIsValid(pos); + return BinarySegmentUtils.getByte(segments, getElementOffset(pos, 1)); + } + + public void setByte(int pos, byte value) { + assertIndexIsValid(pos); + setNotNullAt(pos); + BinarySegmentUtils.setByte(segments, getElementOffset(pos, 1), value); + } + + public void setNullByte(int pos) { + assertIndexIsValid(pos); + BinarySegmentUtils.bitSet(segments, offset + 4, pos); + BinarySegmentUtils.setByte(segments, getElementOffset(pos, 1), (byte) 0); + } + + @Override + public short getShort(int pos) { + assertIndexIsValid(pos); + return BinarySegmentUtils.getShort(segments, getElementOffset(pos, 2)); + } + + public void setShort(int pos, short value) { + assertIndexIsValid(pos); + setNotNullAt(pos); + BinarySegmentUtils.setShort(segments, getElementOffset(pos, 2), value); + } + + public void setNullShort(int pos) { + assertIndexIsValid(pos); + BinarySegmentUtils.bitSet(segments, offset + 4, pos); + BinarySegmentUtils.setShort(segments, getElementOffset(pos, 2), (short) 0); + } + + @Override + public float getFloat(int pos) { + assertIndexIsValid(pos); + return BinarySegmentUtils.getFloat(segments, 
getElementOffset(pos, 4)); + } + + public void setFloat(int pos, float value) { + assertIndexIsValid(pos); + setNotNullAt(pos); + BinarySegmentUtils.setFloat(segments, getElementOffset(pos, 4), value); + } + + public void setNullFloat(int pos) { + assertIndexIsValid(pos); + BinarySegmentUtils.bitSet(segments, offset + 4, pos); + BinarySegmentUtils.setFloat(segments, getElementOffset(pos, 4), 0F); + } + + @Override + public double getDouble(int pos) { + assertIndexIsValid(pos); + return BinarySegmentUtils.getDouble(segments, getElementOffset(pos, 8)); + } + + public void setDouble(int pos, double value) { + assertIndexIsValid(pos); + setNotNullAt(pos); + BinarySegmentUtils.setDouble(segments, getElementOffset(pos, 8), value); + } + + public void setNullDouble(int pos) { + assertIndexIsValid(pos); + BinarySegmentUtils.bitSet(segments, offset + 4, pos); + BinarySegmentUtils.setDouble(segments, getElementOffset(pos, 8), 0.0); + } + + public void setDecimal(int pos, DecimalData value, int precision) { + assertIndexIsValid(pos); + + if (DecimalData.isCompact(precision)) { + // compact format + setLong(pos, value.toUnscaledLong()); + } else { + int fieldOffset = getElementOffset(pos, 8); + int cursor = (int) (BinarySegmentUtils.getLong(segments, fieldOffset) >>> 32); + assert cursor > 0 : "invalid cursor " + cursor; + // zero-out the bytes + BinarySegmentUtils.setLong(segments, offset + cursor, 0L); + BinarySegmentUtils.setLong(segments, offset + cursor + 8, 0L); + + if (value == null) { + setNullAt(pos); + // keep the offset for future update + BinarySegmentUtils.setLong(segments, fieldOffset, ((long) cursor) << 32); + } else { + + byte[] bytes = value.toUnscaledBytes(); + assert (bytes.length <= 16); + + // Write the bytes to the variable length portion. 
+ BinarySegmentUtils.copyFromBytes(segments, offset + cursor, bytes, 0, bytes.length); + setLong(pos, ((long) cursor << 32) | ((long) bytes.length)); + } + } + } + + public void setTimestamp(int pos, TimestampData value, int precision) { + assertIndexIsValid(pos); + + if (TimestampData.isCompact(precision)) { + setLong(pos, value.getMillisecond()); + } else { + int fieldOffset = getElementOffset(pos, 8); + int cursor = (int) (BinarySegmentUtils.getLong(segments, fieldOffset) >>> 32); + assert cursor > 0 : "invalid cursor " + cursor; + + if (value == null) { + setNullAt(pos); + // zero-out the bytes + BinarySegmentUtils.setLong(segments, offset + cursor, 0L); + // keep the offset for future update + BinarySegmentUtils.setLong(segments, fieldOffset, ((long) cursor) << 32); + } else { + // write millisecond to the variable length portion. + BinarySegmentUtils.setLong(segments, offset + cursor, value.getMillisecond()); + // write nanoOfMillisecond to the fixed-length portion. + setLong(pos, ((long) cursor << 32) | (long) value.getNanoOfMillisecond()); + } + } + } + + public boolean anyNull() { + for (int i = offset + 4; i < elementOffset; i += 4) { + if (BinarySegmentUtils.getInt(segments, i) != 0) { + return true; + } + } + return false; + } + + private void checkNoNull() { + if (anyNull()) { + throw new RuntimeException("Primitive array must not contain a null value."); + } + } + + @Override + public boolean[] toBooleanArray() { + checkNoNull(); + boolean[] values = new boolean[size]; + BinarySegmentUtils.copyToUnsafe( + segments, elementOffset, values, BOOLEAN_ARRAY_OFFSET, size); + return values; + } + + @Override + public byte[] toByteArray() { + checkNoNull(); + byte[] values = new byte[size]; + BinarySegmentUtils.copyToUnsafe( + segments, elementOffset, values, BYTE_ARRAY_BASE_OFFSET, size); + return values; + } + + @Override + public short[] toShortArray() { + checkNoNull(); + short[] values = new short[size]; + BinarySegmentUtils.copyToUnsafe( + segments, 
elementOffset, values, SHORT_ARRAY_OFFSET, size * 2); + return values; + } + + @Override + public int[] toIntArray() { + checkNoNull(); + int[] values = new int[size]; + BinarySegmentUtils.copyToUnsafe( + segments, elementOffset, values, INT_ARRAY_OFFSET, size * 4); + return values; + } + + @Override + public long[] toLongArray() { + checkNoNull(); + long[] values = new long[size]; + BinarySegmentUtils.copyToUnsafe( + segments, elementOffset, values, LONG_ARRAY_OFFSET, size * 8); + return values; + } + + @Override + public float[] toFloatArray() { + checkNoNull(); + float[] values = new float[size]; + BinarySegmentUtils.copyToUnsafe( + segments, elementOffset, values, FLOAT_ARRAY_OFFSET, size * 4); + return values; + } + + @Override + public double[] toDoubleArray() { + checkNoNull(); + double[] values = new double[size]; + BinarySegmentUtils.copyToUnsafe( + segments, elementOffset, values, DOUBLE_ARRAY_OFFSET, size * 8); + return values; + } + + @SuppressWarnings("unchecked") + public T[] toObjectArray(DataType elementType) { + Class elementClass = (Class) DataTypeUtils.toInternalConversionClass(elementType); + ArrayData.ElementGetter elementGetter = ArrayData.createElementGetter(elementType); + T[] values = (T[]) Array.newInstance(elementClass, size); + for (int i = 0; i < size; i++) { + if (!isNullAt(i)) { + values[i] = (T) elementGetter.getElementOrNull(this, i); + } + } + return values; + } + + public BinaryArrayData copy() { + return copy(new BinaryArrayData()); + } + + public BinaryArrayData copy(BinaryArrayData reuse) { + byte[] bytes = BinarySegmentUtils.copyToBytes(segments, offset, sizeInBytes); + reuse.pointTo(MemorySegmentFactory.wrap(bytes), 0, sizeInBytes); + return reuse; + } + + @Override + public int hashCode() { + return BinarySegmentUtils.hashByWords(segments, offset, sizeInBytes); + } + + // ------------------------------------------------------------------------------------------ + // Construction Utilities + // 
// ------------------------------------------------------------------------------------------

    /** Creates a binary array holding a copy of {@code arr}; no element is marked null. */
    public static BinaryArrayData fromPrimitiveArray(boolean[] arr) {
        return fromPrimitiveArray(arr, BOOLEAN_ARRAY_OFFSET, arr.length, 1);
    }

    /** Creates a binary array holding a copy of {@code arr}; no element is marked null. */
    public static BinaryArrayData fromPrimitiveArray(byte[] arr) {
        return fromPrimitiveArray(arr, BYTE_ARRAY_BASE_OFFSET, arr.length, 1);
    }

    /** Creates a binary array holding a copy of {@code arr}; no element is marked null. */
    public static BinaryArrayData fromPrimitiveArray(short[] arr) {
        return fromPrimitiveArray(arr, SHORT_ARRAY_OFFSET, arr.length, 2);
    }

    /** Creates a binary array holding a copy of {@code arr}; no element is marked null. */
    public static BinaryArrayData fromPrimitiveArray(int[] arr) {
        return fromPrimitiveArray(arr, INT_ARRAY_OFFSET, arr.length, 4);
    }

    /** Creates a binary array holding a copy of {@code arr}; no element is marked null. */
    public static BinaryArrayData fromPrimitiveArray(long[] arr) {
        return fromPrimitiveArray(arr, LONG_ARRAY_OFFSET, arr.length, 8);
    }

    /** Creates a binary array holding a copy of {@code arr}; no element is marked null. */
    public static BinaryArrayData fromPrimitiveArray(float[] arr) {
        return fromPrimitiveArray(arr, FLOAT_ARRAY_OFFSET, arr.length, 4);
    }

    /** Creates a binary array holding a copy of {@code arr}; no element is marked null. */
    public static BinaryArrayData fromPrimitiveArray(double[] arr) {
        return fromPrimitiveArray(arr, DOUBLE_ARRAY_OFFSET, arr.length, 8);
    }

    /**
     * Builds the binary form of a primitive array: element count, an all-zero null bitmap, then
     * the raw element bytes, padded to a multiple of 8 bytes.
     *
     * @param arr the primitive array to copy from
     * @param offset base offset of the first element inside {@code arr}
     * @param length number of elements
     * @param elementSize size of one element in bytes
     * @return a binary array backed by a new byte array
     * @throws UnsupportedOperationException if the binary form would not fit in a byte array
     */
    private static BinaryArrayData fromPrimitiveArray(
            Object arr, int offset, int length, int elementSize) {
        final long headerInBytes = calculateHeaderInBytes(length);
        // Widen before multiplying: an int product wraps for large arrays, which would let the
        // size check below pass with a bogus (negative or truncated) value region.
        final long valueRegionInBytes = (long) elementSize * length;

        // must align by 8 bytes
        long totalSizeInLongs = (headerInBytes + valueRegionInBytes + 7) / 8;
        if (totalSizeInLongs > Integer.MAX_VALUE / 8) {
            throw new UnsupportedOperationException(
                    "Cannot convert this array to unsafe format as " + "it's too big.");
        }
        long totalSize = totalSizeInLongs * 8;

        // A new byte array is zero-filled, so the null bitmap starts with no bit set.
        final byte[] data = new byte[(int) totalSize];

        UNSAFE.putInt(data, (long) BYTE_ARRAY_BASE_OFFSET, length);
        UNSAFE.copyMemory(
                arr, offset, data, BYTE_ARRAY_BASE_OFFSET + headerInBytes, valueRegionInBytes);

        BinaryArrayData result = new BinaryArrayData();
        result.pointTo(MemorySegmentFactory.wrap(data), 0, (int) totalSize);
        return result;
    }
}
diff --git 
a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinaryFormat.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinaryFormat.java new file mode 100644 index 00000000000..8bed6c4a530 --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinaryFormat.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */

package org.apache.flink.cdc.common.data.binary;

import org.apache.flink.cdc.common.annotation.Internal;
import org.apache.flink.core.memory.MemorySegment;

/** Binary format spanning {@link MemorySegment}s. */
@Internal
public interface BinaryFormat {

    /**
     * Largest payload, in bytes, that is kept in the fixed-length part; anything longer goes to
     * the variable-length part. See more in {@link BinaryRecordData}.
     *
     * <p>If len is less than 8, its binary format is: 1-bit mark(1) = 1, 7-bits len, and 7-bytes
     * data. Data is stored in fix-length part.
     *
     * <p>If len is greater or equal to 8, its binary format is: 1-bit mark(1) = 0, 31-bits offset
     * to the data, and 4-bytes length of data. Data is stored in variable-length part.
     */
    int MAX_FIX_PART_DATA_SIZE = 7;

    /**
     * Mask selecting the highest bit of a long. Form: 10000000 00000000 ... (8 bytes)
     *
     * <p>The bit tells whether the data is stored in the fixed-length part or the variable-length
     * part. see {@link #MAX_FIX_PART_DATA_SIZE} for more information.
     */
    long HIGHEST_FIRST_BIT = 0x8000_0000_0000_0000L;

    /**
     * Mask selecting the 7 length bits (second to eighth bit) of a long. Form: 01111111 00000000
     * ... (8 bytes)
     *
     * <p>These bits carry the length of the data which is stored in this long. see {@link
     * #MAX_FIX_PART_DATA_SIZE} for more information.
     */
    long HIGHEST_SECOND_TO_EIGHTH_BIT = 0x7F00_0000_0000_0000L;

    /** Gets the underlying {@link MemorySegment}s this binary format spans. */
    MemorySegment[] getSegments();

    /** Gets the start offset of this binary data in the {@link MemorySegment}s. */
    int getOffset();

    /** Gets the size in bytes of this binary data. */
    int getSizeInBytes();
}
diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinaryMapData.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinaryMapData.java new file mode 100644 index 00000000000..98cc19d341a --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinaryMapData.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.cdc.common.data.binary; + +import org.apache.flink.cdc.common.annotation.Internal; +import org.apache.flink.cdc.common.data.MapData; +import org.apache.flink.cdc.common.types.DataType; +import org.apache.flink.core.memory.MemorySegment; +import org.apache.flink.core.memory.MemorySegmentFactory; + +import java.util.HashMap; +import java.util.Map; + +import static org.apache.flink.cdc.common.utils.Preconditions.checkArgument; + +/** + * [4 byte(keyArray size in bytes)] + [Key BinaryArray] + [Value BinaryArray]. + * + *

{@code BinaryMap} are influenced by Apache Spark UnsafeMapData. + */ +@Internal +public class BinaryMapData extends BinarySection implements MapData { + private final BinaryArrayData keys; + private final BinaryArrayData values; + + public BinaryMapData() { + keys = new BinaryArrayData(); + values = new BinaryArrayData(); + } + + public int size() { + return keys.size(); + } + + @Override + public void pointTo(MemorySegment[] segments, int offset, int sizeInBytes) { + // Read the numBytes of key array from the first 4 bytes. + final int keyArrayBytes = BinarySegmentUtils.getInt(segments, offset); + assert keyArrayBytes >= 0 : "keyArraySize (" + keyArrayBytes + ") should >= 0"; + final int valueArrayBytes = sizeInBytes - keyArrayBytes - 4; + assert valueArrayBytes >= 0 : "valueArraySize (" + valueArrayBytes + ") should >= 0"; + + keys.pointTo(segments, offset + 4, keyArrayBytes); + values.pointTo(segments, offset + 4 + keyArrayBytes, valueArrayBytes); + + assert keys.size() == values.size(); + + this.segments = segments; + this.offset = offset; + this.sizeInBytes = sizeInBytes; + } + + public BinaryArrayData keyArray() { + return keys; + } + + public BinaryArrayData valueArray() { + return values; + } + + public Map toJavaMap(DataType keyType, DataType valueType) { + Object[] keyArray = keys.toObjectArray(keyType); + Object[] valueArray = values.toObjectArray(valueType); + + Map map = new HashMap<>(); + for (int i = 0; i < keyArray.length; i++) { + map.put(keyArray[i], valueArray[i]); + } + return map; + } + + public BinaryMapData copy() { + return copy(new BinaryMapData()); + } + + public BinaryMapData copy(BinaryMapData reuse) { + byte[] bytes = BinarySegmentUtils.copyToBytes(segments, offset, sizeInBytes); + reuse.pointTo(MemorySegmentFactory.wrap(bytes), 0, sizeInBytes); + return reuse; + } + + @Override + public int hashCode() { + return BinarySegmentUtils.hashByWords(segments, offset, sizeInBytes); + } + + // 
------------------------------------------------------------------------------------------ + // Construction Utilities + // ------------------------------------------------------------------------------------------ + + public static BinaryMapData valueOf(BinaryArrayData key, BinaryArrayData value) { + checkArgument(key.segments.length == 1 && value.getSegments().length == 1); + byte[] bytes = new byte[4 + key.sizeInBytes + value.sizeInBytes]; + MemorySegment segment = MemorySegmentFactory.wrap(bytes); + segment.putInt(0, key.sizeInBytes); + key.getSegments()[0].copyTo(key.getOffset(), segment, 4, key.sizeInBytes); + value.getSegments()[0].copyTo( + value.getOffset(), segment, 4 + key.sizeInBytes, value.sizeInBytes); + BinaryMapData map = new BinaryMapData(); + map.pointTo(segment, 0, bytes.length); + return map; + } +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinaryRecordData.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinaryRecordData.java new file mode 100644 index 00000000000..bc645eb7db4 --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinaryRecordData.java @@ -0,0 +1,308 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.data.binary; + +import org.apache.flink.cdc.common.annotation.Internal; +import org.apache.flink.cdc.common.data.ArrayData; +import org.apache.flink.cdc.common.data.DateData; +import org.apache.flink.cdc.common.data.DecimalData; +import org.apache.flink.cdc.common.data.LocalZonedTimestampData; +import org.apache.flink.cdc.common.data.MapData; +import org.apache.flink.cdc.common.data.RecordData; +import org.apache.flink.cdc.common.data.StringData; +import org.apache.flink.cdc.common.data.TimeData; +import org.apache.flink.cdc.common.data.TimestampData; +import org.apache.flink.cdc.common.data.ZonedTimestampData; +import org.apache.flink.cdc.common.types.variant.BinaryVariant; +import org.apache.flink.cdc.common.utils.Preconditions; +import org.apache.flink.core.memory.MemorySegment; +import org.apache.flink.core.memory.MemorySegmentFactory; + +import java.nio.ByteOrder; + +/** + * An implementation of {@link RecordData} which is backed by {@link MemorySegment} instead of + * Object. It can significantly reduce the serialization/deserialization of Java objects. + * + *

A BinaryRecordData has two parts: a fixed-length part and a variable-length part. + * + *

The fixed-length part contains a 1-byte header, the null bit set, and the field values. The null bit set is + * used for null tracking and is aligned to 8-byte word boundaries. `Field values` holds + * fixed-length primitive types and variable-length values that fit within 8 bytes. If + * a variable-length value does not fit, the field instead stores the length and offset of its data in the variable-length + * part. + * + *

The fixed-length part always falls within a single MemorySegment, which speeds up reading and + * writing of fields. During the write phase, if the target memory segment has less space than the fixed-length + * part size, the remaining space is skipped. Therefore the number of fields in a single Row cannot exceed + * the capacity of a single MemorySegment; if there are too many fields, we suggest that the user set a + * bigger pageSize of MemorySegment. + * + *

Variable-length part may fall into multiple MemorySegments. + */ +@Internal +public final class BinaryRecordData extends BinarySection implements RecordData, NullAwareGetters { + + public static final boolean LITTLE_ENDIAN = + (ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN); + private static final long FIRST_BYTE_ZERO = LITTLE_ENDIAN ? ~0xFFL : ~(0xFFL << 56L); + public static final int HEADER_SIZE_IN_BITS = 8; + + public static final String TIMESTAMP_DELIMITER = "//"; + + public static int calculateBitSetWidthInBytes(int arity) { + return ((arity + 63 + HEADER_SIZE_IN_BITS) / 64) * 8; + } + + private final int arity; + private final int nullBitsSizeInBytes; + + public BinaryRecordData(int arity) { + Preconditions.checkArgument(arity >= 0); + this.arity = arity; + this.nullBitsSizeInBytes = calculateBitSetWidthInBytes(arity); + } + + private int getFieldOffset(int pos) { + return offset + nullBitsSizeInBytes + pos * 8; + } + + private void assertIndexIsValid(int index) { + assert index >= 0 : "index (" + index + ") should >= 0"; + assert index < arity : "index (" + index + ") should < " + arity; + } + + public int getFixedLengthPartSize() { + return nullBitsSizeInBytes + 8 * arity; + } + + @Override + public int getArity() { + return arity; + } + + @Override + public boolean isNullAt(int pos) { + assertIndexIsValid(pos); + return BinarySegmentUtils.bitGet(segments[0], offset, pos + HEADER_SIZE_IN_BITS); + } + + @Override + public boolean getBoolean(int pos) { + assertIndexIsValid(pos); + return segments[0].getBoolean(getFieldOffset(pos)); + } + + @Override + public byte getByte(int pos) { + assertIndexIsValid(pos); + return segments[0].get(getFieldOffset(pos)); + } + + @Override + public short getShort(int pos) { + assertIndexIsValid(pos); + return segments[0].getShort(getFieldOffset(pos)); + } + + @Override + public int getInt(int pos) { + assertIndexIsValid(pos); + return segments[0].getInt(getFieldOffset(pos)); + } + + @Override + public long getLong(int 
pos) { + assertIndexIsValid(pos); + return segments[0].getLong(getFieldOffset(pos)); + } + + @Override + public float getFloat(int pos) { + assertIndexIsValid(pos); + return segments[0].getFloat(getFieldOffset(pos)); + } + + @Override + public double getDouble(int pos) { + assertIndexIsValid(pos); + return segments[0].getDouble(getFieldOffset(pos)); + } + + @Override + public StringData getString(int pos) { + assertIndexIsValid(pos); + int fieldOffset = getFieldOffset(pos); + final long offsetAndLen = segments[0].getLong(fieldOffset); + return BinarySegmentUtils.readStringData(segments, offset, fieldOffset, offsetAndLen); + } + + @Override + public DecimalData getDecimal(int pos, int precision, int scale) { + assertIndexIsValid(pos); + + if (DecimalData.isCompact(precision)) { + return DecimalData.fromUnscaledLong( + segments[0].getLong(getFieldOffset(pos)), precision, scale); + } + + int fieldOffset = getFieldOffset(pos); + final long offsetAndSize = segments[0].getLong(fieldOffset); + return BinarySegmentUtils.readDecimalData( + segments, offset, offsetAndSize, precision, scale); + } + + @Override + public TimestampData getTimestamp(int pos, int precision) { + assertIndexIsValid(pos); + + int fieldOffset = getFieldOffset(pos); + final long offsetAndNanoOfMilli = segments[0].getLong(fieldOffset); + return BinarySegmentUtils.readTimestampData(segments, offset, offsetAndNanoOfMilli); + } + + @Override + public ZonedTimestampData getZonedTimestamp(int pos, int precision) { + String[] parts = getString(pos).toString().split(TIMESTAMP_DELIMITER); + return ZonedTimestampData.of( + Long.parseLong(parts[0]), Integer.parseInt(parts[1]), parts[2]); + } + + @Override + public LocalZonedTimestampData getLocalZonedTimestampData(int pos, int precision) { + assertIndexIsValid(pos); + + int fieldOffset = getFieldOffset(pos); + final long offsetAndNanoOfMilli = segments[0].getLong(fieldOffset); + return BinarySegmentUtils.readLocalZonedTimestampData( + segments, offset, 
offsetAndNanoOfMilli); + } + + @Override + public byte[] getBinary(int pos) { + assertIndexIsValid(pos); + int fieldOffset = getFieldOffset(pos); + final long offsetAndLen = segments[0].getLong(fieldOffset); + return BinarySegmentUtils.readBinary(segments, offset, fieldOffset, offsetAndLen); + } + + @Override + public ArrayData getArray(int pos) { + assertIndexIsValid(pos); + return BinarySegmentUtils.readArrayData(segments, offset, getLong(pos)); + } + + @Override + public MapData getMap(int pos) { + assertIndexIsValid(pos); + return BinarySegmentUtils.readMapData(segments, offset, getLong(pos)); + } + + @Override + public RecordData getRow(int pos, int numFields) { + assertIndexIsValid(pos); + return BinarySegmentUtils.readRecordData(segments, numFields, offset, getLong(pos)); + } + + @Override + public DateData getDate(int pos) { + assertIndexIsValid(pos); + return DateData.fromEpochDay(getInt(pos)); + } + + @Override + public TimeData getTime(int pos) { + assertIndexIsValid(pos); + return TimeData.fromMillisOfDay(getInt(pos)); + } + + @Override + public BinaryVariant getVariant(int pos) { + assertIndexIsValid(pos); + return BinarySegmentUtils.readVariant(segments, offset, getLong(pos)); + } + + /** The bit is 1 when the field is null. Default is 0. */ + @Override + public boolean anyNull() { + // Skip the header. 
+ if ((segments[0].getLong(0) & FIRST_BYTE_ZERO) != 0) { + return true; + } + for (int i = 8; i < nullBitsSizeInBytes; i += 8) { + if (segments[0].getLong(i) != 0) { + return true; + } + } + return false; + } + + @Override + public boolean anyNull(int[] fields) { + for (int field : fields) { + if (isNullAt(field)) { + return true; + } + } + return false; + } + + public BinaryRecordData copy() { + return copy(new BinaryRecordData(arity)); + } + + public BinaryRecordData copy(BinaryRecordData reuse) { + return copyInternal(reuse); + } + + private BinaryRecordData copyInternal(BinaryRecordData reuse) { + byte[] bytes = BinarySegmentUtils.copyToBytes(segments, offset, sizeInBytes); + reuse.pointTo(MemorySegmentFactory.wrap(bytes), 0, sizeInBytes); + return reuse; + } + + public void clear() { + segments = null; + offset = 0; + sizeInBytes = 0; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + // both BinaryRecordData and NestedRowData have the same memory format + if (!(o instanceof BinaryRecordData)) { + return false; + } + final BinarySection that = (BinarySection) o; + return sizeInBytes == that.sizeInBytes + && BinarySegmentUtils.equals( + segments, offset, that.segments, that.offset, sizeInBytes); + } + + @Override + public int hashCode() { + return BinarySegmentUtils.hashByWords(segments, offset, sizeInBytes); + } + + public void setTotalSize(int sizeInBytes) { + this.sizeInBytes = sizeInBytes; + } +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinarySection.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinarySection.java new file mode 100644 index 00000000000..799c91a4f96 --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinarySection.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.data.binary; + +import org.apache.flink.cdc.common.annotation.Internal; +import org.apache.flink.cdc.common.utils.Preconditions; +import org.apache.flink.core.memory.MemorySegment; + +/** A basic implementation of {@link BinaryFormat} which describe a section of memory. 
*/ +@Internal +public class BinarySection implements BinaryFormat { + + protected MemorySegment[] segments; + protected int offset; + protected int sizeInBytes; + + public BinarySection() {} + + public BinarySection(MemorySegment[] segments, int offset, int sizeInBytes) { + Preconditions.checkArgument(segments != null); + this.segments = segments; + this.offset = offset; + this.sizeInBytes = sizeInBytes; + } + + public final void pointTo(MemorySegment segment, int offset, int sizeInBytes) { + pointTo(new MemorySegment[] {segment}, offset, sizeInBytes); + } + + public void pointTo(MemorySegment[] segments, int offset, int sizeInBytes) { + Preconditions.checkArgument(segments != null); + this.segments = segments; + this.offset = offset; + this.sizeInBytes = sizeInBytes; + } + + public MemorySegment[] getSegments() { + return segments; + } + + public int getOffset() { + return offset; + } + + public int getSizeInBytes() { + return sizeInBytes; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final BinarySection that = (BinarySection) o; + return sizeInBytes == that.sizeInBytes + && BinarySegmentUtils.equals( + segments, offset, that.segments, that.offset, sizeInBytes); + } + + @Override + public int hashCode() { + return BinarySegmentUtils.hash(segments, offset, sizeInBytes); + } +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinarySegmentUtils.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinarySegmentUtils.java new file mode 100644 index 00000000000..9c463106df0 --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinarySegmentUtils.java @@ -0,0 +1,1198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.data.binary; + +import org.apache.flink.cdc.common.annotation.Internal; +import org.apache.flink.cdc.common.data.ArrayData; +import org.apache.flink.cdc.common.data.DecimalData; +import org.apache.flink.cdc.common.data.LocalZonedTimestampData; +import org.apache.flink.cdc.common.data.MapData; +import org.apache.flink.cdc.common.data.RecordData; +import org.apache.flink.cdc.common.data.StringData; +import org.apache.flink.cdc.common.data.TimestampData; +import org.apache.flink.cdc.common.types.variant.BinaryVariant; +import org.apache.flink.core.memory.DataOutputView; +import org.apache.flink.core.memory.MemorySegment; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +import static org.apache.flink.cdc.common.data.binary.BinaryFormat.HIGHEST_FIRST_BIT; +import static org.apache.flink.cdc.common.data.binary.BinaryFormat.HIGHEST_SECOND_TO_EIGHTH_BIT; +import static org.apache.flink.core.memory.MemoryUtils.UNSAFE; + +/** Utilities for binary data segments which heavily uses {@link MemorySegment}. */ +@Internal +public final class BinarySegmentUtils { + + /** Constant that flags the byte order. 
*/ + public static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN; + + private static final int ADDRESS_BITS_PER_WORD = 3; + + private static final int BIT_BYTE_INDEX_MASK = 7; + + /** + * SQL execution threads is limited, not too many, so it can bear the overhead of 64K per + * thread. + */ + private static final int MAX_BYTES_LENGTH = 1024 * 64; + + private static final int MAX_CHARS_LENGTH = 1024 * 32; + + private static final int BYTE_ARRAY_BASE_OFFSET = UNSAFE.arrayBaseOffset(byte[].class); + + private static final ThreadLocal BYTES_LOCAL = new ThreadLocal<>(); + private static final ThreadLocal CHARS_LOCAL = new ThreadLocal<>(); + + private BinarySegmentUtils() { + // do not instantiate + } + + /** + * Allocate bytes that is only for temporary usage, it should not be stored in somewhere else. + * Use a {@link ThreadLocal} to reuse bytes to avoid overhead of byte[] new and gc. + * + *

If there are methods that can only accept a byte[], instead of a MemorySegment[] + * parameter, we can allocate a reuse bytes and copy the MemorySegment data to byte[], then call + * the method. Such as String deserialization. + */ + public static byte[] allocateReuseBytes(int length) { + byte[] bytes = BYTES_LOCAL.get(); + + if (bytes == null) { + if (length <= MAX_BYTES_LENGTH) { + bytes = new byte[MAX_BYTES_LENGTH]; + BYTES_LOCAL.set(bytes); + } else { + bytes = new byte[length]; + } + } else if (bytes.length < length) { + bytes = new byte[length]; + } + + return bytes; + } + + public static char[] allocateReuseChars(int length) { + char[] chars = CHARS_LOCAL.get(); + + if (chars == null) { + if (length <= MAX_CHARS_LENGTH) { + chars = new char[MAX_CHARS_LENGTH]; + CHARS_LOCAL.set(chars); + } else { + chars = new char[length]; + } + } else if (chars.length < length) { + chars = new char[length]; + } + + return chars; + } + + /** + * Copy segments to a new byte[]. + * + * @param segments Source segments. + * @param offset Source segments offset. + * @param numBytes the number bytes to copy. + */ + public static byte[] copyToBytes(MemorySegment[] segments, int offset, int numBytes) { + return copyToBytes(segments, offset, new byte[numBytes], 0, numBytes); + } + + /** + * Copy segments to target byte[]. + * + * @param segments Source segments. + * @param offset Source segments offset. + * @param bytes target byte[]. + * @param bytesOffset target byte[] offset. + * @param numBytes the number bytes to copy. 
+ */ + public static byte[] copyToBytes( + MemorySegment[] segments, int offset, byte[] bytes, int bytesOffset, int numBytes) { + if (inFirstSegment(segments, offset, numBytes)) { + segments[0].get(offset, bytes, bytesOffset, numBytes); + } else { + copyMultiSegmentsToBytes(segments, offset, bytes, bytesOffset, numBytes); + } + return bytes; + } + + public static void copyMultiSegmentsToBytes( + MemorySegment[] segments, int offset, byte[] bytes, int bytesOffset, int numBytes) { + int remainSize = numBytes; + for (MemorySegment segment : segments) { + int remain = segment.size() - offset; + if (remain > 0) { + int nCopy = Math.min(remain, remainSize); + segment.get(offset, bytes, numBytes - remainSize + bytesOffset, nCopy); + remainSize -= nCopy; + // next new segment. + offset = 0; + if (remainSize == 0) { + return; + } + } else { + // remain is negative, let's advance to next segment + // now the offset = offset - segmentSize (-remain) + offset = -remain; + } + } + } + + /** + * Copy segments to target unsafe pointer. + * + * @param segments Source segments. + * @param offset The position where the bytes are started to be read from these memory segments. + * @param target The unsafe memory to copy the bytes to. + * @param pointer The position in the target unsafe memory to copy the chunk to. + * @param numBytes the number bytes to copy. 
+ */ + public static void copyToUnsafe( + MemorySegment[] segments, int offset, Object target, int pointer, int numBytes) { + if (inFirstSegment(segments, offset, numBytes)) { + segments[0].copyToUnsafe(offset, target, pointer, numBytes); + } else { + copyMultiSegmentsToUnsafe(segments, offset, target, pointer, numBytes); + } + } + + private static void copyMultiSegmentsToUnsafe( + MemorySegment[] segments, int offset, Object target, int pointer, int numBytes) { + int remainSize = numBytes; + for (MemorySegment segment : segments) { + int remain = segment.size() - offset; + if (remain > 0) { + int nCopy = Math.min(remain, remainSize); + segment.copyToUnsafe(offset, target, numBytes - remainSize + pointer, nCopy); + remainSize -= nCopy; + // next new segment. + offset = 0; + if (remainSize == 0) { + return; + } + } else { + // remain is negative, let's advance to next segment + // now the offset = offset - segmentSize (-remain) + offset = -remain; + } + } + } + + /** + * Copy bytes of segments to output view. + * + *

Note: It just copies the data in, not include the length. + * + * @param segments source segments + * @param offset offset for segments + * @param sizeInBytes size in bytes + * @param target target output view + */ + public static void copyToView( + MemorySegment[] segments, int offset, int sizeInBytes, DataOutputView target) + throws IOException { + for (MemorySegment sourceSegment : segments) { + int curSegRemain = sourceSegment.size() - offset; + if (curSegRemain > 0) { + int copySize = Math.min(curSegRemain, sizeInBytes); + + byte[] bytes = allocateReuseBytes(copySize); + sourceSegment.get(offset, bytes, 0, copySize); + target.write(bytes, 0, copySize); + + sizeInBytes -= copySize; + offset = 0; + } else { + offset -= sourceSegment.size(); + } + + if (sizeInBytes == 0) { + return; + } + } + + if (sizeInBytes != 0) { + throw new RuntimeException( + "No copy finished, this should be a bug, " + + "The remaining length is: " + + sizeInBytes); + } + } + + /** + * Copy target segments from source byte[]. + * + * @param segments target segments. + * @param offset target segments offset. + * @param bytes source byte[]. + * @param bytesOffset source byte[] offset. + * @param numBytes the number bytes to copy. + */ + public static void copyFromBytes( + MemorySegment[] segments, int offset, byte[] bytes, int bytesOffset, int numBytes) { + if (segments.length == 1) { + segments[0].put(offset, bytes, bytesOffset, numBytes); + } else { + copyMultiSegmentsFromBytes(segments, offset, bytes, bytesOffset, numBytes); + } + } + + private static void copyMultiSegmentsFromBytes( + MemorySegment[] segments, int offset, byte[] bytes, int bytesOffset, int numBytes) { + int remainSize = numBytes; + for (MemorySegment segment : segments) { + int remain = segment.size() - offset; + if (remain > 0) { + int nCopy = Math.min(remain, remainSize); + segment.put(offset, bytes, numBytes - remainSize + bytesOffset, nCopy); + remainSize -= nCopy; + // next new segment. 
+ offset = 0; + if (remainSize == 0) { + return; + } + } else { + // remain is negative, let's advance to next segment + // now the offset = offset - segmentSize (-remain) + offset = -remain; + } + } + } + + /** Maybe not copied, if want copy, please use copyTo. */ + public static byte[] getBytes(MemorySegment[] segments, int baseOffset, int sizeInBytes) { + // avoid copy if `base` is `byte[]` + if (segments.length == 1) { + byte[] heapMemory = segments[0].getHeapMemory(); + if (baseOffset == 0 && heapMemory != null && heapMemory.length == sizeInBytes) { + return heapMemory; + } else { + byte[] bytes = new byte[sizeInBytes]; + segments[0].get(baseOffset, bytes, 0, sizeInBytes); + return bytes; + } + } else { + byte[] bytes = new byte[sizeInBytes]; + copyMultiSegmentsToBytes(segments, baseOffset, bytes, 0, sizeInBytes); + return bytes; + } + } + + /** + * Equals two memory segments regions. + * + * @param segments1 Segments 1 + * @param offset1 Offset of segments1 to start equaling + * @param segments2 Segments 2 + * @param offset2 Offset of segments2 to start equaling + * @param len Length of the equaled memory region + * @return true if equal, false otherwise + */ + public static boolean equals( + MemorySegment[] segments1, + int offset1, + MemorySegment[] segments2, + int offset2, + int len) { + if (inFirstSegment(segments1, offset1, len) && inFirstSegment(segments2, offset2, len)) { + return segments1[0].equalTo(segments2[0], offset1, offset2, len); + } else { + return equalsMultiSegments(segments1, offset1, segments2, offset2, len); + } + } + + public static boolean equalsMultiSegments( + MemorySegment[] segments1, + int offset1, + MemorySegment[] segments2, + int offset2, + int len) { + if (len == 0) { + // quick way and avoid segSize is zero. + return true; + } + + int segSize1 = segments1[0].size(); + int segSize2 = segments2[0].size(); + + // find first segIndex and segOffset of segments. 
+ int segIndex1 = offset1 / segSize1; + int segIndex2 = offset2 / segSize2; + int segOffset1 = offset1 - segSize1 * segIndex1; // equal to % + int segOffset2 = offset2 - segSize2 * segIndex2; // equal to % + + while (len > 0) { + int equalLen = Math.min(Math.min(len, segSize1 - segOffset1), segSize2 - segOffset2); + if (!segments1[segIndex1].equalTo( + segments2[segIndex2], segOffset1, segOffset2, equalLen)) { + return false; + } + len -= equalLen; + segOffset1 += equalLen; + if (segOffset1 == segSize1) { + segOffset1 = 0; + segIndex1++; + } + segOffset2 += equalLen; + if (segOffset2 == segSize2) { + segOffset2 = 0; + segIndex2++; + } + } + return true; + } + + /** + * hash segments to int, numBytes must be aligned to 4 bytes. + * + * @param segments Source segments. + * @param offset Source segments offset. + * @param numBytes the number bytes to hash. + */ + public static int hashByWords(MemorySegment[] segments, int offset, int numBytes) { + if (inFirstSegment(segments, offset, numBytes)) { + return MurmurHashUtils.hashBytesByWords(segments[0], offset, numBytes); + } else { + return hashMultiSegByWords(segments, offset, numBytes); + } + } + + private static int hashMultiSegByWords(MemorySegment[] segments, int offset, int numBytes) { + byte[] bytes = allocateReuseBytes(numBytes); + copyMultiSegmentsToBytes(segments, offset, bytes, 0, numBytes); + return MurmurHashUtils.hashUnsafeBytesByWords(bytes, BYTE_ARRAY_BASE_OFFSET, numBytes); + } + + /** + * hash segments to int. + * + * @param segments Source segments. + * @param offset Source segments offset. + * @param numBytes the number bytes to hash. 
+ */ + public static int hash(MemorySegment[] segments, int offset, int numBytes) { + if (inFirstSegment(segments, offset, numBytes)) { + return MurmurHashUtils.hashBytes(segments[0], offset, numBytes); + } else { + return hashMultiSeg(segments, offset, numBytes); + } + } + + private static int hashMultiSeg(MemorySegment[] segments, int offset, int numBytes) { + byte[] bytes = allocateReuseBytes(numBytes); + copyMultiSegmentsToBytes(segments, offset, bytes, 0, numBytes); + return MurmurHashUtils.hashUnsafeBytes(bytes, BYTE_ARRAY_BASE_OFFSET, numBytes); + } + + /** Is it just in first MemorySegment, we use quick way to do something. */ + private static boolean inFirstSegment(MemorySegment[] segments, int offset, int numBytes) { + return numBytes + offset <= segments[0].size(); + } + + /** + * Given a bit index, return the byte index containing it. + * + * @param bitIndex the bit index. + * @return the byte index. + */ + private static int byteIndex(int bitIndex) { + return bitIndex >>> ADDRESS_BITS_PER_WORD; + } + + /** + * unset bit. + * + * @param segment target segment. + * @param baseOffset bits base offset. + * @param index bit index from base offset. + */ + public static void bitUnSet(MemorySegment segment, int baseOffset, int index) { + int offset = baseOffset + byteIndex(index); + byte current = segment.get(offset); + current &= ~(1 << (index & BIT_BYTE_INDEX_MASK)); + segment.put(offset, current); + } + + /** + * set bit. + * + * @param segment target segment. + * @param baseOffset bits base offset. + * @param index bit index from base offset. + */ + public static void bitSet(MemorySegment segment, int baseOffset, int index) { + int offset = baseOffset + byteIndex(index); + byte current = segment.get(offset); + current |= (1 << (index & BIT_BYTE_INDEX_MASK)); + segment.put(offset, current); + } + + /** + * read bit. + * + * @param segment target segment. + * @param baseOffset bits base offset. + * @param index bit index from base offset. 
+ */ + public static boolean bitGet(MemorySegment segment, int baseOffset, int index) { + int offset = baseOffset + byteIndex(index); + byte current = segment.get(offset); + return (current & (1 << (index & BIT_BYTE_INDEX_MASK))) != 0; + } + + /** + * unset bit from segments. + * + * @param segments target segments. + * @param baseOffset bits base offset. + * @param index bit index from base offset. + */ + public static void bitUnSet(MemorySegment[] segments, int baseOffset, int index) { + if (segments.length == 1) { + MemorySegment segment = segments[0]; + int offset = baseOffset + byteIndex(index); + byte current = segment.get(offset); + current &= ~(1 << (index & BIT_BYTE_INDEX_MASK)); + segment.put(offset, current); + } else { + bitUnSetMultiSegments(segments, baseOffset, index); + } + } + + private static void bitUnSetMultiSegments(MemorySegment[] segments, int baseOffset, int index) { + int offset = baseOffset + byteIndex(index); + int segSize = segments[0].size(); + int segIndex = offset / segSize; + int segOffset = offset - segIndex * segSize; // equal to % + MemorySegment segment = segments[segIndex]; + + byte current = segment.get(segOffset); + current &= ~(1 << (index & BIT_BYTE_INDEX_MASK)); + segment.put(segOffset, current); + } + + /** + * set bit from segments. + * + * @param segments target segments. + * @param baseOffset bits base offset. + * @param index bit index from base offset. 
+ */ + public static void bitSet(MemorySegment[] segments, int baseOffset, int index) { + if (segments.length == 1) { + int offset = baseOffset + byteIndex(index); + MemorySegment segment = segments[0]; + byte current = segment.get(offset); + current |= (1 << (index & BIT_BYTE_INDEX_MASK)); + segment.put(offset, current); + } else { + bitSetMultiSegments(segments, baseOffset, index); + } + } + + private static void bitSetMultiSegments(MemorySegment[] segments, int baseOffset, int index) { + int offset = baseOffset + byteIndex(index); + int segSize = segments[0].size(); + int segIndex = offset / segSize; + int segOffset = offset - segIndex * segSize; // equal to % + MemorySegment segment = segments[segIndex]; + + byte current = segment.get(segOffset); + current |= (1 << (index & BIT_BYTE_INDEX_MASK)); + segment.put(segOffset, current); + } + + /** + * read bit from segments. + * + * @param segments target segments. + * @param baseOffset bits base offset. + * @param index bit index from base offset. + */ + public static boolean bitGet(MemorySegment[] segments, int baseOffset, int index) { + int offset = baseOffset + byteIndex(index); + byte current = getByte(segments, offset); + return (current & (1 << (index & BIT_BYTE_INDEX_MASK))) != 0; + } + + /** + * get boolean from segments. + * + * @param segments target segments. + * @param offset value offset. + */ + public static boolean getBoolean(MemorySegment[] segments, int offset) { + if (inFirstSegment(segments, offset, 1)) { + return segments[0].getBoolean(offset); + } else { + return getBooleanMultiSegments(segments, offset); + } + } + + private static boolean getBooleanMultiSegments(MemorySegment[] segments, int offset) { + int segSize = segments[0].size(); + int segIndex = offset / segSize; + int segOffset = offset - segIndex * segSize; // equal to % + return segments[segIndex].getBoolean(segOffset); + } + + /** + * set boolean from segments. + * + * @param segments target segments. 
+ * @param offset value offset. + */ + public static void setBoolean(MemorySegment[] segments, int offset, boolean value) { + if (inFirstSegment(segments, offset, 1)) { + segments[0].putBoolean(offset, value); + } else { + setBooleanMultiSegments(segments, offset, value); + } + } + + private static void setBooleanMultiSegments( + MemorySegment[] segments, int offset, boolean value) { + int segSize = segments[0].size(); + int segIndex = offset / segSize; + int segOffset = offset - segIndex * segSize; // equal to % + segments[segIndex].putBoolean(segOffset, value); + } + + /** + * get byte from segments. + * + * @param segments target segments. + * @param offset value offset. + */ + public static byte getByte(MemorySegment[] segments, int offset) { + if (inFirstSegment(segments, offset, 1)) { + return segments[0].get(offset); + } else { + return getByteMultiSegments(segments, offset); + } + } + + private static byte getByteMultiSegments(MemorySegment[] segments, int offset) { + int segSize = segments[0].size(); + int segIndex = offset / segSize; + int segOffset = offset - segIndex * segSize; // equal to % + return segments[segIndex].get(segOffset); + } + + /** + * set byte from segments. + * + * @param segments target segments. + * @param offset value offset. + */ + public static void setByte(MemorySegment[] segments, int offset, byte value) { + if (inFirstSegment(segments, offset, 1)) { + segments[0].put(offset, value); + } else { + setByteMultiSegments(segments, offset, value); + } + } + + private static void setByteMultiSegments(MemorySegment[] segments, int offset, byte value) { + int segSize = segments[0].size(); + int segIndex = offset / segSize; + int segOffset = offset - segIndex * segSize; // equal to % + segments[segIndex].put(segOffset, value); + } + + /** + * get int from segments. + * + * @param segments target segments. + * @param offset value offset. 
+ */ + public static int getInt(MemorySegment[] segments, int offset) { + if (inFirstSegment(segments, offset, 4)) { + return segments[0].getInt(offset); + } else { + return getIntMultiSegments(segments, offset); + } + } + + private static int getIntMultiSegments(MemorySegment[] segments, int offset) { + int segSize = segments[0].size(); + int segIndex = offset / segSize; + int segOffset = offset - segIndex * segSize; // equal to % + + if (segOffset < segSize - 3) { + return segments[segIndex].getInt(segOffset); + } else { + return getIntSlowly(segments, segSize, segIndex, segOffset); + } + } + + private static int getIntSlowly( + MemorySegment[] segments, int segSize, int segNum, int segOffset) { + MemorySegment segment = segments[segNum]; + int ret = 0; + for (int i = 0; i < 4; i++) { + if (segOffset == segSize) { + segment = segments[++segNum]; + segOffset = 0; + } + int unsignedByte = segment.get(segOffset) & 0xff; + if (LITTLE_ENDIAN) { + ret |= (unsignedByte << (i * 8)); + } else { + ret |= (unsignedByte << ((3 - i) * 8)); + } + segOffset++; + } + return ret; + } + + /** + * set int from segments. + * + * @param segments target segments. + * @param offset value offset. 
+ */ + public static void setInt(MemorySegment[] segments, int offset, int value) { + if (inFirstSegment(segments, offset, 4)) { + segments[0].putInt(offset, value); + } else { + setIntMultiSegments(segments, offset, value); + } + } + + private static void setIntMultiSegments(MemorySegment[] segments, int offset, int value) { + int segSize = segments[0].size(); + int segIndex = offset / segSize; + int segOffset = offset - segIndex * segSize; // equal to % + + if (segOffset < segSize - 3) { + segments[segIndex].putInt(segOffset, value); + } else { + setIntSlowly(segments, segSize, segIndex, segOffset, value); + } + } + + private static void setIntSlowly( + MemorySegment[] segments, int segSize, int segNum, int segOffset, int value) { + MemorySegment segment = segments[segNum]; + for (int i = 0; i < 4; i++) { + if (segOffset == segSize) { + segment = segments[++segNum]; + segOffset = 0; + } + int unsignedByte; + if (LITTLE_ENDIAN) { + unsignedByte = value >> (i * 8); + } else { + unsignedByte = value >> ((3 - i) * 8); + } + segment.put(segOffset, (byte) unsignedByte); + segOffset++; + } + } + + /** + * get long from segments. + * + * @param segments target segments. + * @param offset value offset. 
+ */ + public static long getLong(MemorySegment[] segments, int offset) { + if (inFirstSegment(segments, offset, 8)) { + return segments[0].getLong(offset); + } else { + return getLongMultiSegments(segments, offset); + } + } + + private static long getLongMultiSegments(MemorySegment[] segments, int offset) { + int segSize = segments[0].size(); + int segIndex = offset / segSize; + int segOffset = offset - segIndex * segSize; // equal to % + + if (segOffset < segSize - 7) { + return segments[segIndex].getLong(segOffset); + } else { + return getLongSlowly(segments, segSize, segIndex, segOffset); + } + } + + private static long getLongSlowly( + MemorySegment[] segments, int segSize, int segNum, int segOffset) { + MemorySegment segment = segments[segNum]; + long ret = 0; + for (int i = 0; i < 8; i++) { + if (segOffset == segSize) { + segment = segments[++segNum]; + segOffset = 0; + } + long unsignedByte = segment.get(segOffset) & 0xff; + if (LITTLE_ENDIAN) { + ret |= (unsignedByte << (i * 8)); + } else { + ret |= (unsignedByte << ((7 - i) * 8)); + } + segOffset++; + } + return ret; + } + + /** + * set long from segments. + * + * @param segments target segments. + * @param offset value offset. 
+ */ + public static void setLong(MemorySegment[] segments, int offset, long value) { + if (inFirstSegment(segments, offset, 8)) { + segments[0].putLong(offset, value); + } else { + setLongMultiSegments(segments, offset, value); + } + } + + private static void setLongMultiSegments(MemorySegment[] segments, int offset, long value) { + int segSize = segments[0].size(); + int segIndex = offset / segSize; + int segOffset = offset - segIndex * segSize; // equal to % + + if (segOffset < segSize - 7) { + segments[segIndex].putLong(segOffset, value); + } else { + setLongSlowly(segments, segSize, segIndex, segOffset, value); + } + } + + private static void setLongSlowly( + MemorySegment[] segments, int segSize, int segNum, int segOffset, long value) { + MemorySegment segment = segments[segNum]; + for (int i = 0; i < 8; i++) { + if (segOffset == segSize) { + segment = segments[++segNum]; + segOffset = 0; + } + long unsignedByte; + if (LITTLE_ENDIAN) { + unsignedByte = value >> (i * 8); + } else { + unsignedByte = value >> ((7 - i) * 8); + } + segment.put(segOffset, (byte) unsignedByte); + segOffset++; + } + } + + /** + * get short from segments. + * + * @param segments target segments. + * @param offset value offset. + */ + public static short getShort(MemorySegment[] segments, int offset) { + if (inFirstSegment(segments, offset, 2)) { + return segments[0].getShort(offset); + } else { + return getShortMultiSegments(segments, offset); + } + } + + private static short getShortMultiSegments(MemorySegment[] segments, int offset) { + int segSize = segments[0].size(); + int segIndex = offset / segSize; + int segOffset = offset - segIndex * segSize; // equal to % + + if (segOffset < segSize - 1) { + return segments[segIndex].getShort(segOffset); + } else { + return (short) getTwoByteSlowly(segments, segSize, segIndex, segOffset); + } + } + + /** + * set short from segments. + * + * @param segments target segments. + * @param offset value offset. 
+ */ + public static void setShort(MemorySegment[] segments, int offset, short value) { + if (inFirstSegment(segments, offset, 2)) { + segments[0].putShort(offset, value); + } else { + setShortMultiSegments(segments, offset, value); + } + } + + private static void setShortMultiSegments(MemorySegment[] segments, int offset, short value) { + int segSize = segments[0].size(); + int segIndex = offset / segSize; + int segOffset = offset - segIndex * segSize; // equal to % + + if (segOffset < segSize - 1) { + segments[segIndex].putShort(segOffset, value); + } else { + setTwoByteSlowly(segments, segSize, segIndex, segOffset, value, value >> 8); + } + } + + /** + * get float from segments. + * + * @param segments target segments. + * @param offset value offset. + */ + public static float getFloat(MemorySegment[] segments, int offset) { + if (inFirstSegment(segments, offset, 4)) { + return segments[0].getFloat(offset); + } else { + return getFloatMultiSegments(segments, offset); + } + } + + private static float getFloatMultiSegments(MemorySegment[] segments, int offset) { + int segSize = segments[0].size(); + int segIndex = offset / segSize; + int segOffset = offset - segIndex * segSize; // equal to % + + if (segOffset < segSize - 3) { + return segments[segIndex].getFloat(segOffset); + } else { + return Float.intBitsToFloat(getIntSlowly(segments, segSize, segIndex, segOffset)); + } + } + + /** + * set float from segments. + * + * @param segments target segments. + * @param offset value offset. 
+ */ + public static void setFloat(MemorySegment[] segments, int offset, float value) { + if (inFirstSegment(segments, offset, 4)) { + segments[0].putFloat(offset, value); + } else { + setFloatMultiSegments(segments, offset, value); + } + } + + private static void setFloatMultiSegments(MemorySegment[] segments, int offset, float value) { + int segSize = segments[0].size(); + int segIndex = offset / segSize; + int segOffset = offset - segIndex * segSize; // equal to % + + if (segOffset < segSize - 3) { + segments[segIndex].putFloat(segOffset, value); + } else { + setIntSlowly(segments, segSize, segIndex, segOffset, Float.floatToRawIntBits(value)); + } + } + + /** + * get double from segments. + * + * @param segments target segments. + * @param offset value offset. + */ + public static double getDouble(MemorySegment[] segments, int offset) { + if (inFirstSegment(segments, offset, 8)) { + return segments[0].getDouble(offset); + } else { + return getDoubleMultiSegments(segments, offset); + } + } + + private static double getDoubleMultiSegments(MemorySegment[] segments, int offset) { + int segSize = segments[0].size(); + int segIndex = offset / segSize; + int segOffset = offset - segIndex * segSize; // equal to % + + if (segOffset < segSize - 7) { + return segments[segIndex].getDouble(segOffset); + } else { + return Double.longBitsToDouble(getLongSlowly(segments, segSize, segIndex, segOffset)); + } + } + + /** + * set double from segments. + * + * @param segments target segments. + * @param offset value offset. 
+ */ + public static void setDouble(MemorySegment[] segments, int offset, double value) { + if (inFirstSegment(segments, offset, 8)) { + segments[0].putDouble(offset, value); + } else { + setDoubleMultiSegments(segments, offset, value); + } + } + + private static void setDoubleMultiSegments(MemorySegment[] segments, int offset, double value) { + int segSize = segments[0].size(); + int segIndex = offset / segSize; + int segOffset = offset - segIndex * segSize; // equal to % + + if (segOffset < segSize - 7) { + segments[segIndex].putDouble(segOffset, value); + } else { + setLongSlowly( + segments, segSize, segIndex, segOffset, Double.doubleToRawLongBits(value)); + } + } + + private static int getTwoByteSlowly( + MemorySegment[] segments, int segSize, int segNum, int segOffset) { + MemorySegment segment = segments[segNum]; + int ret = 0; + for (int i = 0; i < 2; i++) { + if (segOffset == segSize) { + segment = segments[++segNum]; + segOffset = 0; + } + int unsignedByte = segment.get(segOffset) & 0xff; + if (LITTLE_ENDIAN) { + ret |= (unsignedByte << (i * 8)); + } else { + ret |= (unsignedByte << ((1 - i) * 8)); + } + segOffset++; + } + return ret; + } + + private static void setTwoByteSlowly( + MemorySegment[] segments, int segSize, int segNum, int segOffset, int b1, int b2) { + MemorySegment segment = segments[segNum]; + segment.put(segOffset, (byte) (LITTLE_ENDIAN ? b1 : b2)); + segOffset++; + if (segOffset == segSize) { + segment = segments[++segNum]; + segOffset = 0; + } + segment.put(segOffset, (byte) (LITTLE_ENDIAN ? b2 : b1)); + } + + /** Gets an instance of {@link DecimalData} from underlying {@link MemorySegment}. 
*/ + public static DecimalData readDecimalData( + MemorySegment[] segments, + int baseOffset, + long offsetAndSize, + int precision, + int scale) { + final int size = ((int) offsetAndSize); + int subOffset = (int) (offsetAndSize >> 32); + byte[] bytes = new byte[size]; + copyToBytes(segments, baseOffset + subOffset, bytes, 0, size); + return DecimalData.fromUnscaledBytes(bytes, precision, scale); + } + + /** + * Gets an instance of {@link TimestampData} from underlying {@link MemorySegment}. + * + * @param segments the underlying MemorySegments + * @param baseOffset the base offset of current instance of {@code TimestampData} + * @param offsetAndNanos the offset of milli-seconds part and nanoseconds + * @return an instance of {@link TimestampData} + */ + public static TimestampData readTimestampData( + MemorySegment[] segments, int baseOffset, long offsetAndNanos) { + final int nanoOfMillisecond = (int) offsetAndNanos; + final int subOffset = (int) (offsetAndNanos >> 32); + final long millisecond = getLong(segments, baseOffset + subOffset); + return TimestampData.fromMillis(millisecond, nanoOfMillisecond); + } + + /** + * Gets an instance of {@link LocalZonedTimestampData} from underlying {@link MemorySegment}. + * + * @param segments the underlying MemorySegments + * @param baseOffset the base offset of current instance of {@code TimestampData} + * @param offsetAndNanos the offset of milli-seconds part and nanoseconds + * @return an instance of {@link LocalZonedTimestampData} + */ + public static LocalZonedTimestampData readLocalZonedTimestampData( + MemorySegment[] segments, int baseOffset, long offsetAndNanos) { + final int nanoOfMillisecond = (int) offsetAndNanos; + final int subOffset = (int) (offsetAndNanos >> 32); + final long millisecond = getLong(segments, baseOffset + subOffset); + return LocalZonedTimestampData.fromEpochMillis(millisecond, nanoOfMillisecond); + } + + /** + * Get binary, if len less than 8, will be include in variablePartOffsetAndLen. 
+ * + *

Note: Need to consider the ByteOrder. + * + * @param baseOffset base offset of composite binary format. + * @param fieldOffset absolute start offset of 'variablePartOffsetAndLen'. + * @param variablePartOffsetAndLen a long value, real data or offset and len. + */ + public static byte[] readBinary( + MemorySegment[] segments, + int baseOffset, + int fieldOffset, + long variablePartOffsetAndLen) { + long mark = variablePartOffsetAndLen & HIGHEST_FIRST_BIT; + if (mark == 0) { + final int subOffset = (int) (variablePartOffsetAndLen >> 32); + final int len = (int) variablePartOffsetAndLen; + return BinarySegmentUtils.copyToBytes(segments, baseOffset + subOffset, len); + } else { + int len = (int) ((variablePartOffsetAndLen & HIGHEST_SECOND_TO_EIGHTH_BIT) >>> 56); + if (BinarySegmentUtils.LITTLE_ENDIAN) { + return BinarySegmentUtils.copyToBytes(segments, fieldOffset, len); + } else { + // fieldOffset + 1 to skip header. + return BinarySegmentUtils.copyToBytes(segments, fieldOffset + 1, len); + } + } + } + + /** + * Get binary string, if len less than 8, will be include in variablePartOffsetAndLen. + * + *

Note: Need to consider the ByteOrder. + * + * @param baseOffset base offset of composite binary format. + * @param fieldOffset absolute start offset of 'variablePartOffsetAndLen'. + * @param variablePartOffsetAndLen a long value, real data or offset and len. + */ + public static StringData readStringData( + MemorySegment[] segments, + int baseOffset, + int fieldOffset, + long variablePartOffsetAndLen) { + long mark = variablePartOffsetAndLen & HIGHEST_FIRST_BIT; + if (mark == 0) { + final int subOffset = (int) (variablePartOffsetAndLen >> 32); + final int len = (int) variablePartOffsetAndLen; + return BinaryStringData.fromAddress(segments, baseOffset + subOffset, len); + } else { + int len = (int) ((variablePartOffsetAndLen & HIGHEST_SECOND_TO_EIGHTH_BIT) >>> 56); + if (BinarySegmentUtils.LITTLE_ENDIAN) { + return BinaryStringData.fromAddress(segments, fieldOffset, len); + } else { + // fieldOffset + 1 to skip header. + return BinaryStringData.fromAddress(segments, fieldOffset + 1, len); + } + } + } + + /** Gets an instance of {@link RecordData} from underlying {@link MemorySegment}. 
*/ + public static RecordData readRecordData( + MemorySegment[] segments, int numFields, int baseOffset, long offsetAndSize) { + final int size = ((int) offsetAndSize); + int offset = (int) (offsetAndSize >> 32); + BinaryRecordData recordData = new BinaryRecordData(numFields); + recordData.pointTo(segments, offset + baseOffset, size); + return recordData; + } + + public static BinaryVariant readVariant( + MemorySegment[] segments, int baseOffset, long offsetAndSize) { + final int size = ((int) offsetAndSize); + int offset = (int) (offsetAndSize >> 32); + byte[] bytes = copyToBytes(segments, offset + baseOffset, size); + ByteBuffer buffer = ByteBuffer.wrap(bytes); + int metaLen = buffer.getInt(); + int valueLen = bytes.length - 4 - metaLen; + + byte[] meta = new byte[metaLen]; + byte[] value = new byte[valueLen]; + buffer.get(meta, 0, metaLen); + buffer.get(value, 0, valueLen); + + return new BinaryVariant(value, meta); + } + + /** + * Find equal segments2 in segments1. + * + * @param segments1 segs to find. + * @param segments2 sub segs. + * @return Return the found offset, return -1 if not find. + */ + public static int find( + MemorySegment[] segments1, + int offset1, + int numBytes1, + MemorySegment[] segments2, + int offset2, + int numBytes2) { + if (numBytes2 == 0) { // quick way 1. + return offset1; + } + if (inFirstSegment(segments1, offset1, numBytes1) + && inFirstSegment(segments2, offset2, numBytes2)) { + byte first = segments2[0].get(offset2); + int end = numBytes1 - numBytes2 + offset1; + for (int i = offset1; i <= end; i++) { + // quick way 2: equal first byte. 
+ if (segments1[0].get(i) == first + && segments1[0].equalTo(segments2[0], i, offset2, numBytes2)) { + return i; + } + } + return -1; + } else { + return findInMultiSegments( + segments1, offset1, numBytes1, segments2, offset2, numBytes2); + } + } + + private static int findInMultiSegments( + MemorySegment[] segments1, + int offset1, + int numBytes1, + MemorySegment[] segments2, + int offset2, + int numBytes2) { + int end = numBytes1 - numBytes2 + offset1; + for (int i = offset1; i <= end; i++) { + if (equalsMultiSegments(segments1, i, segments2, offset2, numBytes2)) { + return i; + } + } + return -1; + } + + /** Gets an instance of {@link MapData} from underlying {@link MemorySegment}. */ + public static MapData readMapData( + MemorySegment[] segments, int baseOffset, long offsetAndSize) { + final int size = ((int) offsetAndSize); + int offset = (int) (offsetAndSize >> 32); + BinaryMapData map = new BinaryMapData(); + map.pointTo(segments, offset + baseOffset, size); + return map; + } + + /** Gets an instance of {@link ArrayData} from underlying {@link MemorySegment}. */ + public static ArrayData readArrayData( + MemorySegment[] segments, int baseOffset, long offsetAndSize) { + final int size = ((int) offsetAndSize); + int offset = (int) (offsetAndSize >> 32); + BinaryArrayData array = new BinaryArrayData(); + array.pointTo(segments, offset + baseOffset, size); + return array; + } +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinaryStringData.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinaryStringData.java new file mode 100644 index 00000000000..52af6a25c49 --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/BinaryStringData.java @@ -0,0 +1,875 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.data.binary; + +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.cdc.common.annotation.Internal; +import org.apache.flink.cdc.common.data.StringData; +import org.apache.flink.cdc.common.utils.Preconditions; +import org.apache.flink.cdc.common.utils.StringUtf8Utils; +import org.apache.flink.core.memory.MemorySegment; +import org.apache.flink.core.memory.MemorySegmentFactory; + +import javax.annotation.Nonnull; + +import java.util.Arrays; + +/** + * A lazily binary implementation of {@link StringData} which is backed by {@link MemorySegment}s + * and {@link String}. + * + *

Either {@link MemorySegment}s or {@link String} must be provided when constructing {@link + * BinaryStringData}. The other representation will be materialized when needed. + * + *

It provides many useful methods for comparison, search, and so on. + */ +@Internal +public final class BinaryStringData extends LazyBinaryFormat implements StringData { + + public static final BinaryStringData EMPTY_UTF8 = + BinaryStringData.fromBytes(StringUtf8Utils.encodeUTF8("")); + + public BinaryStringData() {} + + public BinaryStringData(String javaObject) { + super(javaObject); + } + + public BinaryStringData(MemorySegment[] segments, int offset, int sizeInBytes) { + super(segments, offset, sizeInBytes); + } + + public BinaryStringData( + MemorySegment[] segments, int offset, int sizeInBytes, String javaObject) { + super(segments, offset, sizeInBytes, javaObject); + } + + // ------------------------------------------------------------------------------------------ + // Construction Utilities + // ------------------------------------------------------------------------------------------ + + /** + * Creates a {@link BinaryStringData} instance from the given address (base and offset) and + * length. + */ + public static BinaryStringData fromAddress(MemorySegment[] segments, int offset, int numBytes) { + return new BinaryStringData(segments, offset, numBytes); + } + + /** Creates a {@link BinaryStringData} instance from the given Java string. */ + public static BinaryStringData fromString(String str) { + if (str == null) { + return null; + } else { + return new BinaryStringData(str); + } + } + + /** Creates a {@link BinaryStringData} instance from the given UTF-8 bytes. */ + public static BinaryStringData fromBytes(byte[] bytes) { + return fromBytes(bytes, 0, bytes.length); + } + + /** + * Creates a {@link BinaryStringData} instance from the given UTF-8 bytes with offset and number + * of bytes. 
+ */ + public static BinaryStringData fromBytes(byte[] bytes, int offset, int numBytes) { + return new BinaryStringData( + new MemorySegment[] {MemorySegmentFactory.wrap(bytes)}, offset, numBytes); + } + + /** Creates a {@link BinaryStringData} instance that contains `length` spaces. */ + public static BinaryStringData blankString(int length) { + byte[] spaces = new byte[length]; + Arrays.fill(spaces, (byte) ' '); + return fromBytes(spaces); + } + + // ------------------------------------------------------------------------------------------ + // Public Interfaces + // ------------------------------------------------------------------------------------------ + + @Override + public byte[] toBytes() { + ensureMaterialized(); + return BinarySegmentUtils.getBytes( + binarySection.segments, binarySection.offset, binarySection.sizeInBytes); + } + + @Override + public boolean equals(Object o) { + if (o instanceof BinaryStringData) { + BinaryStringData other = (BinaryStringData) o; + if (javaObject != null && other.javaObject != null) { + return javaObject.equals(other.javaObject); + } + + ensureMaterialized(); + other.ensureMaterialized(); + return binarySection.equals(other.binarySection); + } else { + return false; + } + } + + @Override + public int hashCode() { + ensureMaterialized(); + return binarySection.hashCode(); + } + + @Override + public String toString() { + if (javaObject == null) { + byte[] bytes = BinarySegmentUtils.allocateReuseBytes(binarySection.sizeInBytes); + BinarySegmentUtils.copyToBytes( + binarySection.segments, + binarySection.offset, + bytes, + 0, + binarySection.sizeInBytes); + javaObject = StringUtf8Utils.decodeUTF8(bytes, 0, binarySection.sizeInBytes); + } + return javaObject; + } + + /** + * Compares two strings lexicographically. Since UTF-8 uses groups of six bits, it is sometimes + * useful to use octal notation which uses 3-bit groups. 
With a calculator which can convert + * between hexadecimal and octal it can be easier to manually create or interpret UTF-8 compared + * with using binary. So we just compare the binary. + */ + @Override + public int compareTo(@Nonnull StringData o) { + // BinaryStringData is the only implementation of StringData + BinaryStringData other = (BinaryStringData) o; + if (javaObject != null && other.javaObject != null) { + return javaObject.compareTo(other.javaObject); + } + + ensureMaterialized(); + other.ensureMaterialized(); + if (binarySection.segments.length == 1 && other.binarySection.segments.length == 1) { + + int len = Math.min(binarySection.sizeInBytes, other.binarySection.sizeInBytes); + MemorySegment seg1 = binarySection.segments[0]; + MemorySegment seg2 = other.binarySection.segments[0]; + + for (int i = 0; i < len; i++) { + int res = + (seg1.get(binarySection.offset + i) & 0xFF) + - (seg2.get(other.binarySection.offset + i) & 0xFF); + if (res != 0) { + return res; + } + } + return binarySection.sizeInBytes - other.binarySection.sizeInBytes; + } + + // if there are multi segments. + return compareMultiSegments(other); + } + + /** Find the boundaries of segments, and then compare MemorySegment. */ + private int compareMultiSegments(BinaryStringData other) { + + if (binarySection.sizeInBytes == 0 || other.binarySection.sizeInBytes == 0) { + return binarySection.sizeInBytes - other.binarySection.sizeInBytes; + } + + int len = Math.min(binarySection.sizeInBytes, other.binarySection.sizeInBytes); + + MemorySegment seg1 = binarySection.segments[0]; + MemorySegment seg2 = other.binarySection.segments[0]; + + int segmentSize = binarySection.segments[0].size(); + int otherSegmentSize = other.binarySection.segments[0].size(); + + int sizeOfFirst1 = segmentSize - binarySection.offset; + int sizeOfFirst2 = otherSegmentSize - other.binarySection.offset; + + int varSegIndex1 = 1; + int varSegIndex2 = 1; + + // find the first segment of this string. 
+ while (sizeOfFirst1 <= 0) { + sizeOfFirst1 += segmentSize; + seg1 = binarySection.segments[varSegIndex1++]; + } + + while (sizeOfFirst2 <= 0) { + sizeOfFirst2 += otherSegmentSize; + seg2 = other.binarySection.segments[varSegIndex2++]; + } + + int offset1 = segmentSize - sizeOfFirst1; + int offset2 = otherSegmentSize - sizeOfFirst2; + + int needCompare = Math.min(Math.min(sizeOfFirst1, sizeOfFirst2), len); + + while (needCompare > 0) { + // compare in one segment. + for (int i = 0; i < needCompare; i++) { + int res = (seg1.get(offset1 + i) & 0xFF) - (seg2.get(offset2 + i) & 0xFF); + if (res != 0) { + return res; + } + } + if (needCompare == len) { + break; + } + len -= needCompare; + // next segment + if (sizeOfFirst1 < sizeOfFirst2) { // I am smaller + seg1 = binarySection.segments[varSegIndex1++]; + offset1 = 0; + offset2 += needCompare; + sizeOfFirst1 = segmentSize; + sizeOfFirst2 -= needCompare; + } else if (sizeOfFirst1 > sizeOfFirst2) { // other is smaller + seg2 = other.binarySection.segments[varSegIndex2++]; + offset2 = 0; + offset1 += needCompare; + sizeOfFirst2 = otherSegmentSize; + sizeOfFirst1 -= needCompare; + } else { // same, should go ahead both. + seg1 = binarySection.segments[varSegIndex1++]; + seg2 = other.binarySection.segments[varSegIndex2++]; + offset1 = 0; + offset2 = 0; + sizeOfFirst1 = segmentSize; + sizeOfFirst2 = otherSegmentSize; + } + needCompare = Math.min(Math.min(sizeOfFirst1, sizeOfFirst2), len); + } + + Preconditions.checkArgument(needCompare == len); + + return binarySection.sizeInBytes - other.binarySection.sizeInBytes; + } + + // ------------------------------------------------------------------------------------------ + // Public methods on BinaryStringData + // ------------------------------------------------------------------------------------------ + + /** Returns the number of UTF-8 code points in the string. 
*/ + public int numChars() { + ensureMaterialized(); + if (inFirstSegment()) { + int len = 0; + for (int i = 0; + i < binarySection.sizeInBytes; + i += numBytesForFirstByte(getByteOneSegment(i))) { + len++; + } + return len; + } else { + return numCharsMultiSegs(); + } + } + + private int numCharsMultiSegs() { + int len = 0; + int segSize = binarySection.segments[0].size(); + SegmentAndOffset index = firstSegmentAndOffset(segSize); + int i = 0; + while (i < binarySection.sizeInBytes) { + int charBytes = numBytesForFirstByte(index.value()); + i += charBytes; + len++; + index.skipBytes(charBytes, segSize); + } + return len; + } + + /** + * Returns the {@code byte} value at the specified index. An index ranges from {@code 0} to + * {@code binarySection.sizeInBytes - 1}. + * + * @param index the index of the {@code byte} value. + * @return the {@code byte} value at the specified index of this UTF-8 bytes. + * @exception IndexOutOfBoundsException if the {@code index} argument is negative or not less + * than the length of this UTF-8 bytes. 
+ */ + public byte byteAt(int index) { + ensureMaterialized(); + int globalOffset = binarySection.offset + index; + int size = binarySection.segments[0].size(); + if (globalOffset < size) { + return binarySection.segments[0].get(globalOffset); + } else { + return binarySection.segments[globalOffset / size].get(globalOffset % size); + } + } + + @Override + public MemorySegment[] getSegments() { + ensureMaterialized(); + return super.getSegments(); + } + + @Override + public int getOffset() { + ensureMaterialized(); + return super.getOffset(); + } + + @Override + public int getSizeInBytes() { + ensureMaterialized(); + return super.getSizeInBytes(); + } + + public void ensureMaterialized() { + ensureMaterialized(null); + } + + @Override + protected BinarySection materialize(TypeSerializer serializer) { + if (serializer != null) { + throw new IllegalArgumentException( + "BinaryStringData does not support custom serializers"); + } + + byte[] bytes = StringUtf8Utils.encodeUTF8(javaObject); + return new BinarySection( + new MemorySegment[] {MemorySegmentFactory.wrap(bytes)}, 0, bytes.length); + } + + /** Copy a new {@code BinaryStringData}. */ + public BinaryStringData copy() { + ensureMaterialized(); + byte[] copy = + BinarySegmentUtils.copyToBytes( + binarySection.segments, binarySection.offset, binarySection.sizeInBytes); + return new BinaryStringData( + new MemorySegment[] {MemorySegmentFactory.wrap(copy)}, + 0, + binarySection.sizeInBytes, + javaObject); + } + + /** + * Returns a binary string that is a substring of this binary string. The substring begins at + * the specified {@code beginIndex} and extends to the character at index {@code endIndex - 1}. + * + *

     * <p>Examples:
     *
     * <pre>
     * fromString("hamburger").substring(4, 8) returns binary string "urge"
     * fromString("smiles").substring(1, 5) returns binary string "mile"
     * </pre>
     *
+ * + * @param beginIndex the beginning index, inclusive. + * @param endIndex the ending index, exclusive. + * @return the specified substring, return EMPTY_UTF8 when index out of bounds instead of + * StringIndexOutOfBoundsException. + */ + public BinaryStringData substring(int beginIndex, int endIndex) { + ensureMaterialized(); + if (endIndex <= beginIndex || beginIndex >= binarySection.sizeInBytes) { + return EMPTY_UTF8; + } + if (inFirstSegment()) { + MemorySegment segment = binarySection.segments[0]; + int i = 0; + int c = 0; + while (i < binarySection.sizeInBytes && c < beginIndex) { + i += numBytesForFirstByte(segment.get(i + binarySection.offset)); + c += 1; + } + + int j = i; + while (i < binarySection.sizeInBytes && c < endIndex) { + i += numBytesForFirstByte(segment.get(i + binarySection.offset)); + c += 1; + } + + if (i > j) { + byte[] bytes = new byte[i - j]; + segment.get(binarySection.offset + j, bytes, 0, i - j); + return fromBytes(bytes); + } else { + return EMPTY_UTF8; + } + } else { + return substringMultiSegs(beginIndex, endIndex); + } + } + + private BinaryStringData substringMultiSegs(final int start, final int until) { + int segSize = binarySection.segments[0].size(); + SegmentAndOffset index = firstSegmentAndOffset(segSize); + int i = 0; + int c = 0; + while (i < binarySection.sizeInBytes && c < start) { + int charSize = numBytesForFirstByte(index.value()); + i += charSize; + index.skipBytes(charSize, segSize); + c += 1; + } + + int j = i; + while (i < binarySection.sizeInBytes && c < until) { + int charSize = numBytesForFirstByte(index.value()); + i += charSize; + index.skipBytes(charSize, segSize); + c += 1; + } + + if (i > j) { + return fromBytes( + BinarySegmentUtils.copyToBytes( + binarySection.segments, binarySection.offset + j, i - j)); + } else { + return EMPTY_UTF8; + } + } + + /** + * Returns true if and only if this BinaryStringData contains the specified sequence of bytes + * values. 
+ * + * @param s the sequence to search for + * @return true if this BinaryStringData contains {@code s}, false otherwise + */ + public boolean contains(final BinaryStringData s) { + ensureMaterialized(); + s.ensureMaterialized(); + if (s.binarySection.sizeInBytes == 0) { + return true; + } + int find = + BinarySegmentUtils.find( + binarySection.segments, + binarySection.offset, + binarySection.sizeInBytes, + s.binarySection.segments, + s.binarySection.offset, + s.binarySection.sizeInBytes); + return find != -1; + } + + /** + * Tests if this BinaryStringData starts with the specified prefix. + * + * @param prefix the prefix. + * @return {@code true} if the bytes represented by the argument is a prefix of the bytes + * represented by this string; {@code false} otherwise. Note also that {@code true} will be + * returned if the argument is an empty BinaryStringData or is equal to this {@code + * BinaryStringData} object as determined by the {@link #equals(Object)} method. + */ + public boolean startsWith(final BinaryStringData prefix) { + ensureMaterialized(); + prefix.ensureMaterialized(); + return matchAt(prefix, 0); + } + + /** + * Tests if this BinaryStringData ends with the specified suffix. + * + * @param suffix the suffix. + * @return {@code true} if the bytes represented by the argument is a suffix of the bytes + * represented by this object; {@code false} otherwise. Note that the result will be {@code + * true} if the argument is the empty string or is equal to this {@code BinaryStringData} + * object as determined by the {@link #equals(Object)} method. + */ + public boolean endsWith(final BinaryStringData suffix) { + ensureMaterialized(); + suffix.ensureMaterialized(); + return matchAt(suffix, binarySection.sizeInBytes - suffix.binarySection.sizeInBytes); + } + + /** + * Returns a string whose value is this string, with any leading and trailing whitespace + * removed. 
+ * + * @return A string whose value is this string, with any leading and trailing white space + * removed, or this string if it has no leading or trailing white space. + */ + public BinaryStringData trim() { + /* only the space byte 0x20 is stripped; tabs, line breaks and other whitespace are kept */ ensureMaterialized(); + if (inFirstSegment()) { + int s = 0; + int e = this.binarySection.sizeInBytes - 1; + // skip all of the space (0x20) in the left side + while (s < this.binarySection.sizeInBytes && getByteOneSegment(s) == 0x20) { + s++; + } + // skip all of the space (0x20) in the right side + while (e >= s && getByteOneSegment(e) == 0x20) { + e--; + } + if (s > e) { + // empty string + return EMPTY_UTF8; + } else { + return copyBinaryStringInOneSeg(s, e - s + 1); + } + } else { + return trimMultiSegs(); + } + } + + /** Variant of {@link #trim()} for data that is not fully contained in the first segment. */ private BinaryStringData trimMultiSegs() { + int s = 0; + int e = this.binarySection.sizeInBytes - 1; + int segSize = binarySection.segments[0].size(); + SegmentAndOffset front = firstSegmentAndOffset(segSize); + // skip all of the space (0x20) in the left side + while (s < this.binarySection.sizeInBytes && front.value() == 0x20) { + s++; + front.nextByte(segSize); + } + SegmentAndOffset behind = lastSegmentAndOffset(segSize); + // skip all of the space (0x20) in the right side + while (e >= s && behind.value() == 0x20) { + e--; + behind.previousByte(segSize); + } + if (s > e) { + // empty string + return EMPTY_UTF8; + } else { + /* copyBinaryString takes an inclusive end position */ return copyBinaryString(s, e); + } + } + + /** + * Returns the index within this string of the first occurrence of the specified substring, + * starting at the specified index. + * + * @param str the substring to search for. + * @param fromIndex the index from which to start the search. + * @return the index of the first occurrence of the specified substring, starting at the + * specified index, or {@code -1} if there is no such occurrence.
+ */ + public int indexOf(BinaryStringData str, int fromIndex) { + ensureMaterialized(); + str.ensureMaterialized(); + if (str.binarySection.sizeInBytes == 0) { + /* NOTE(review): an empty search string yields 0 whatever fromIndex is, whereas java.lang.String#indexOf can return a non-zero index in that case; confirm this is intended */ return 0; + } + if (inFirstSegment()) { + // position in byte + int byteIdx = 0; + // position is char + int charIdx = 0; + while (byteIdx < binarySection.sizeInBytes && charIdx < fromIndex) { + byteIdx += numBytesForFirstByte(getByteOneSegment(byteIdx)); + charIdx++; + } + do { + if (byteIdx + str.binarySection.sizeInBytes > binarySection.sizeInBytes) { + return -1; + } + if (BinarySegmentUtils.equals( + binarySection.segments, + binarySection.offset + byteIdx, + str.binarySection.segments, + str.binarySection.offset, + str.binarySection.sizeInBytes)) { + /* the result is a character index, not a byte position */ return charIdx; + } + byteIdx += numBytesForFirstByte(getByteOneSegment(byteIdx)); + charIdx++; + } while (byteIdx < binarySection.sizeInBytes); + + return -1; + } else { + return indexOfMultiSegs(str, fromIndex); + } + } + + /** Variant of {@link #indexOf(BinaryStringData, int)} for data that is not fully contained in the first segment. */ private int indexOfMultiSegs(BinaryStringData str, int fromIndex) { + // position in byte + int byteIdx = 0; + // position is char + int charIdx = 0; + int segSize = binarySection.segments[0].size(); + SegmentAndOffset index = firstSegmentAndOffset(segSize); + while (byteIdx < binarySection.sizeInBytes && charIdx < fromIndex) { + int charBytes = numBytesForFirstByte(index.value()); + byteIdx += charBytes; + charIdx++; + index.skipBytes(charBytes, segSize); + } + do { + if (byteIdx + str.binarySection.sizeInBytes > binarySection.sizeInBytes) { + return -1; + } + if (BinarySegmentUtils.equals( + binarySection.segments, + binarySection.offset + byteIdx, + str.binarySection.segments, + str.binarySection.offset, + str.binarySection.sizeInBytes)) { + return charIdx; + } + int charBytes = numBytesForFirstByte(index.segment.get(index.offset)); + byteIdx += charBytes; + charIdx++; + index.skipBytes(charBytes, segSize); + } while (byteIdx < binarySection.sizeInBytes); + + return -1; + } + + /** + * Converts all of the characters in this
{@code BinaryStringData} to upper case. + * + * @return the {@code BinaryStringData}, converted to uppercase. + */ + public BinaryStringData toUpperCase() { + if (javaObject != null) { + return javaToUpperCase(); + } + if (binarySection.sizeInBytes == 0) { + return EMPTY_UTF8; + } + int size = binarySection.segments[0].size(); + SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size); + byte[] bytes = new byte[binarySection.sizeInBytes]; + bytes[0] = (byte) Character.toTitleCase(segmentAndOffset.value()); + for (int i = 0; i < binarySection.sizeInBytes; i++) { + byte b = segmentAndOffset.value(); + if (numBytesForFirstByte(b) != 1) { + // fallback + return javaToUpperCase(); + } + int upper = Character.toUpperCase((int) b); + if (upper > 127) { + // fallback + return javaToUpperCase(); + } + bytes[i] = (byte) upper; + segmentAndOffset.nextByte(size); + } + return fromBytes(bytes); + } + + private BinaryStringData javaToUpperCase() { + return fromString(toString().toUpperCase()); + } + + /** + * Converts all of the characters in this {@code BinaryStringData} to lower case. + * + * @return the {@code BinaryStringData}, converted to lowercase. 
+ */ + public BinaryStringData toLowerCase() { + if (javaObject != null) { + return javaToLowerCase(); + } + if (binarySection.sizeInBytes == 0) { + return EMPTY_UTF8; + } + int size = binarySection.segments[0].size(); + SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size); + // every element is written by the loop below, so the array needs no initial value + byte[] bytes = new byte[binarySection.sizeInBytes]; + for (int i = 0; i < binarySection.sizeInBytes; i++) { + byte b = segmentAndOffset.value(); + if (numBytesForFirstByte(b) != 1) { + // fallback + return javaToLowerCase(); + } + int lower = Character.toLowerCase((int) b); + if (lower > 127) { + // fallback + return javaToLowerCase(); + } + bytes[i] = (byte) lower; + segmentAndOffset.nextByte(size); + } + return fromBytes(bytes); + } + + /** + * Lower-cases through the Java string form. + * + * <p>NOTE(review): {@link String#toLowerCase()} uses the default locale while the byte-wise + * path does not depend on it; confirm whether a fixed locale is wanted here. + */ + private BinaryStringData javaToLowerCase() { + return fromString(toString().toLowerCase()); + } + + // ------------------------------------------------------------------------------------------ + // Internal methods on BinaryStringData + // ------------------------------------------------------------------------------------------ + + /** Reads the byte at string position {@code i} from the first segment only. */ + byte getByteOneSegment(int i) { + return binarySection.segments[0].get(binarySection.offset + i); + } + + /** Returns true when offset plus size does not exceed the size of the first segment. */ + boolean inFirstSegment() { + return binarySection.sizeInBytes + binarySection.offset <= binarySection.segments[0].size(); + } + + /** Byte-wise comparison of {@code s} against this string at byte position {@code pos}. */ + private boolean matchAt(final BinaryStringData s, int pos) { + return (inFirstSegment() && s.inFirstSegment()) + ?
matchAtOneSeg(s, pos) + : matchAtVarSeg(s, pos); + } + + /** {@link #matchAt} for the case where both strings lie in their first segment. */ + private boolean matchAtOneSeg(final BinaryStringData s, int pos) { + // the range checks come first so that a negative or too large pos yields false + return s.binarySection.sizeInBytes + pos <= binarySection.sizeInBytes + && pos >= 0 + && binarySection.segments[0].equalTo( + s.binarySection.segments[0], + binarySection.offset + pos, + s.binarySection.offset, + s.binarySection.sizeInBytes); + } + + /** {@link #matchAt} for the case where at least one string is not in its first segment. */ + private boolean matchAtVarSeg(final BinaryStringData s, int pos) { + return s.binarySection.sizeInBytes + pos <= binarySection.sizeInBytes + && pos >= 0 + && BinarySegmentUtils.equals( + binarySection.segments, + binarySection.offset + pos, + s.binarySection.segments, + s.binarySection.offset, + s.binarySection.sizeInBytes); + } + + /** Copies {@code len} bytes starting at byte position {@code start} out of the first segment. */ + BinaryStringData copyBinaryStringInOneSeg(int start, int len) { + byte[] newBytes = new byte[len]; + binarySection.segments[0].get(binarySection.offset + start, newBytes, 0, len); + return fromBytes(newBytes); + } + + /** + * Copies the bytes from position {@code start} to position {@code end}. Unlike {@link + * #copyBinaryStringInOneSeg(int, int)}, the second argument is an inclusive end position, not + * a length. + */ + BinaryStringData copyBinaryString(int start, int end) { + int len = end - start + 1; + byte[] newBytes = new byte[len]; + BinarySegmentUtils.copyToBytes( + binarySection.segments, binarySection.offset + start, newBytes, 0, len); + return fromBytes(newBytes); + } + + /** Creates a cursor on the first byte of this string, for segments of size {@code segSize}. */ + SegmentAndOffset firstSegmentAndOffset(int segSize) { + int segIndex = binarySection.offset / segSize; + return new SegmentAndOffset(segIndex, binarySection.offset % segSize); + } + + /** Creates a cursor on the last byte of this string, for segments of size {@code segSize}. */ + SegmentAndOffset lastSegmentAndOffset(int segSize) { + int lastOffset = binarySection.offset + binarySection.sizeInBytes - 1; + int segIndex = lastOffset / segSize; + return new SegmentAndOffset(segIndex, lastOffset % segSize); + } + + /** Like {@link #firstSegmentAndOffset(int)}, skipping the division when in the first segment. */ + private SegmentAndOffset startSegmentAndOffset(int segSize) { + return inFirstSegment() + ? new SegmentAndOffset(0, binarySection.offset) + : firstSegmentAndOffset(segSize); + } + + /** CurrentSegment and positionInSegment.
+ */ + class SegmentAndOffset { + int segIndex; /* index of the current segment in binarySection.segments */ + MemorySegment segment; /* current segment, or null once segIndex is outside the segments array */ + int offset; /* byte position inside the current segment */ + + private SegmentAndOffset(int segIndex, int offset) { + this.segIndex = segIndex; + this.segment = binarySection.segments[segIndex]; + this.offset = offset; + } + + private void assignSegment() { + segment = + segIndex >= 0 && segIndex < binarySection.segments.length + ? binarySection.segments[segIndex] + : null; + } + + /** Moves one byte back, stepping to the last byte of the previous segment when needed. */ void previousByte(int segSize) { + offset--; + if (offset == -1) { + segIndex--; + assignSegment(); + offset = segSize - 1; + } + } + + /** Moves one byte forward, stepping to the first byte of the next segment when needed. */ void nextByte(int segSize) { + offset++; + checkAdvance(segSize); + } + + private void checkAdvance(int segSize) { + if (offset == segSize) { + advance(); + } + } + + private void advance() { + segIndex++; + assignSegment(); + offset = 0; + } + + /** Moves {@code n} bytes forward, crossing as many segment boundaries as needed. */ void skipBytes(int n, int segSize) { + int remaining = segSize - this.offset; + if (remaining > n) { + this.offset += n; + } else { + while (true) { + int toSkip = Math.min(remaining, n); + n -= toSkip; + if (n <= 0) { + this.offset += toSkip; + checkAdvance(segSize); + return; + } + advance(); + remaining = segSize - this.offset; + } + } + } + + /** Returns the byte under the cursor. */ byte value() { + return this.segment.get(this.offset); + } + } + + /** + * Returns the number of bytes for a code point with the first byte as `b`. + * + * <p>A byte that matches none of the lead byte patterns below is counted as one byte. + * + * @param b The first byte of a code point + */ + static int numBytesForFirstByte(final byte b) { + if (b >= 0) { + // 1 byte, 7 bits: 0xxxxxxx + return 1; + } /* the (b & 0x1e) != 0 test excludes the lead bytes 0xC0 and 0xC1, which fall through to the last branch */ else if ((b >> 5) == -2 && (b & 0x1e) != 0) { + // 2 bytes, 11 bits: 110xxxxx 10xxxxxx + return 2; + } else if ((b >> 4) == -2) { + // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx + return 3; + } else if ((b >> 3) == -2) { + // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + return 4; + } else { + // Skip the first byte disallowed in UTF-8 + // Handling errors quietly, same semantics to java String.
+ return 1; + } + } +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/LazyBinaryFormat.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/LazyBinaryFormat.java new file mode 100644 index 00000000000..fe05b042495 --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/LazyBinaryFormat.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.data.binary; + +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.cdc.common.annotation.Internal; +import org.apache.flink.core.memory.MemorySegment; + +import java.io.IOException; + +/** + * An abstract implementation of {@link BinaryFormat} which is lazily serialized into binary or + * lazily deserialized into Java object. + * + *

This data structure was introduced to avoid redundant (de)serialization in nested function + * calls. Consider the following function call chain: + * + *

UDF0(input) -> UDF1(result0) -> UDF2(result1) -> UDF3(result2)
+ * + *

In such nested calls, if the return values of the UDFs are in Java object format, this + * results in multiple conversions between Java object and binary format: + * + *

+ * converterToBinary(UDF0(converterToJavaObject(input))) ->
+ *   converterToBinary(UDF1(converterToJavaObject(result0))) ->
+ *     converterToBinary(UDF2(converterToJavaObject(result1))) ->
+ *       ...
+ * 
+ * + *

So we introduced {@link LazyBinaryFormat} to avoid the redundant cost. It has three forms: + * + *

    + *
  • Binary form + *
  • Java object form + *
  • Binary and Java object both exist + *
+ * + *

It can lazy the conversions as much as possible. It will be converted into required form only + * when it is needed. + */ +@Internal +public abstract class LazyBinaryFormat implements BinaryFormat { + + /** Java object form; the constructors without a Java object leave it null. */ T javaObject; + /** Binary form; null until it is passed in or produced by ensureMaterialized. */ BinarySection binarySection; + + public LazyBinaryFormat() { + this(null, null); + } + + public LazyBinaryFormat(MemorySegment[] segments, int offset, int sizeInBytes, T javaObject) { + this(javaObject, new BinarySection(segments, offset, sizeInBytes)); + } + + public LazyBinaryFormat(MemorySegment[] segments, int offset, int sizeInBytes) { + this(null, new BinarySection(segments, offset, sizeInBytes)); + } + + public LazyBinaryFormat(T javaObject) { + this(javaObject, null); + } + + public LazyBinaryFormat(T javaObject, BinarySection binarySection) { + this.javaObject = javaObject; + this.binarySection = binarySection; + } + + public T getJavaObject() { + return javaObject; + } + + public BinarySection getBinarySection() { + return binarySection; + } + + /** Must be public as it is used during code generation. */ + public void setJavaObject(T javaObject) { + this.javaObject = javaObject; + } + + /** Returns the segments of the binary form; throws IllegalStateException when no binary section is present. */ @Override + public MemorySegment[] getSegments() { + if (binarySection == null) { + throw new IllegalStateException("Lazy Binary Format was not materialized"); + } + return binarySection.segments; + } + + /** Returns the offset of the binary form; throws IllegalStateException when no binary section is present. */ @Override + public int getOffset() { + if (binarySection == null) { + throw new IllegalStateException("Lazy Binary Format was not materialized"); + } + return binarySection.offset; + } + + /** Returns the size of the binary form; throws IllegalStateException when no binary section is present. */ @Override + public int getSizeInBytes() { + if (binarySection == null) { + throw new IllegalStateException("Lazy Binary Format was not materialized"); + } + return binarySection.sizeInBytes; + } + + /** Ensure we have materialized binary format.
+ */ + public final void ensureMaterialized(TypeSerializer serializer) { + /* an existing binary section is kept as is, so materialize runs at most once per missing section */ if (binarySection == null) { + try { + this.binarySection = materialize(serializer); + } catch (IOException e) { + /* the IOException is kept as the cause of the unchecked exception */ throw new IllegalStateException(e); + } + } + } + + /** + * Materialize java object to binary format. Inherited classes need to hold the information they + * need. + */ + protected abstract BinarySection materialize(TypeSerializer serializer) throws IOException; +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/MurmurHashUtils.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/MurmurHashUtils.java new file mode 100644 index 00000000000..9caa30dfc3b --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/data/binary/MurmurHashUtils.java @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.data.binary; + +import org.apache.flink.cdc.common.annotation.Internal; +import org.apache.flink.core.memory.MemorySegment; + +import static org.apache.flink.core.memory.MemoryUtils.UNSAFE; + +/** Murmur Hash. This is inspired by Guava's Murmur3_32HashFunction.
+ */ +@Internal +final class MurmurHashUtils { + + private static final int C1 = 0xcc9e2d51; + private static final int C2 = 0x1b873593; + /** Seed passed by every public hash method of this class that takes a length. */ public static final int DEFAULT_SEED = 42; + + private MurmurHashUtils() { + // do not instantiate + } + + /** + * Hash unsafe bytes, length must be aligned to 4 bytes. + * + * @param base base unsafe object + * @param offset offset for unsafe object + * @param lengthInBytes length in bytes + * @return hash code + */ + public static int hashUnsafeBytesByWords(Object base, long offset, int lengthInBytes) { + return hashUnsafeBytesByWords(base, offset, lengthInBytes, DEFAULT_SEED); + } + + /** + * Hash unsafe bytes. + * + * @param base base unsafe object + * @param offset offset for unsafe object + * @param lengthInBytes length in bytes + * @return hash code + */ + public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes) { + return hashUnsafeBytes(base, offset, lengthInBytes, DEFAULT_SEED); + } + + /** + * Hash bytes in MemorySegment, length must be aligned to 4 bytes. + * + * @param segment segment. + * @param offset offset for MemorySegment + * @param lengthInBytes length in MemorySegment + * @return hash code + */ + public static int hashBytesByWords(MemorySegment segment, int offset, int lengthInBytes) { + return hashBytesByWords(segment, offset, lengthInBytes, DEFAULT_SEED); + } + + /** + * Hash bytes in MemorySegment. + * + * @param segment segment.
+ * @param offset offset for MemorySegment + * @param lengthInBytes length in MemorySegment + * @return hash code + */ + public static int hashBytes(MemorySegment segment, int offset, int lengthInBytes) { + return hashBytes(segment, offset, lengthInBytes, DEFAULT_SEED); + } + + private static int hashUnsafeBytesByWords( + Object base, long offset, int lengthInBytes, int seed) { + int h1 = hashUnsafeBytesByInt(base, offset, lengthInBytes, seed); + return fmix(h1, lengthInBytes); + } + + private static int hashBytesByWords( + MemorySegment segment, int offset, int lengthInBytes, int seed) { + int h1 = hashBytesByInt(segment, offset, lengthInBytes, seed); + return fmix(h1, lengthInBytes); + } + + /** Hashes the leading 4-byte words first, then mixes each remaining byte on its own. */ private static int hashBytes(MemorySegment segment, int offset, int lengthInBytes, int seed) { + int lengthAligned = lengthInBytes - lengthInBytes % 4; + int h1 = hashBytesByInt(segment, offset, lengthAligned, seed); + for (int i = lengthAligned; i < lengthInBytes; i++) { + int k1 = mixK1(segment.get(offset + i)); + h1 = mixH1(h1, k1); + } + return fmix(h1, lengthInBytes); + } + + /** Hashes the leading 4-byte words first, then mixes each remaining byte on its own. */ private static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed) { + assert (lengthInBytes >= 0) : "lengthInBytes cannot be negative"; + int lengthAligned = lengthInBytes - lengthInBytes % 4; + int h1 = hashUnsafeBytesByInt(base, offset, lengthAligned, seed); + for (int i = lengthAligned; i < lengthInBytes; i++) { + /* despite its name this holds one byte, sign-extended to int */ int halfWord = UNSAFE.getByte(base, offset + i); + int k1 = mixK1(halfWord); + h1 = mixH1(h1, k1); + } + return fmix(h1, lengthInBytes); + } + + /** Mixes the input four bytes at a time into the seed; the final mix is left to the caller. */ private static int hashUnsafeBytesByInt(Object base, long offset, int lengthInBytes, int seed) { + assert (lengthInBytes % 4 == 0); + int h1 = seed; + for (int i = 0; i < lengthInBytes; i += 4) { + /* despite its name this holds a full 4-byte int */ int halfWord = UNSAFE.getInt(base, offset + i); + int k1 = mixK1(halfWord); + h1 = mixH1(h1, k1); + } + return h1; + } + + /** Mixes the input four bytes at a time into the seed; the final mix is left to the caller. */ private static int hashBytesByInt( + MemorySegment segment, int offset, int lengthInBytes, int seed) {
+ assert (lengthInBytes % 4 == 0); + int h1 = seed; + for (int i = 0; i < lengthInBytes; i += 4) { + int halfWord = segment.getInt(offset + i); + int k1 = mixK1(halfWord); + h1 = mixH1(h1, k1); + } + return h1; + } + + private static int mixK1(int k1) { + k1 *= C1; + k1 = Integer.rotateLeft(k1, 15); + k1 *= C2; + return k1; + } + + private static int mixH1(int h1, int k1) { + h1 ^= k1; + h1 = Integer.rotateLeft(h1, 13); + h1 = h1 * 5 + 0xe6546b64; + return h1; + } + + // Finalization mix - force all bits of a hash block to avalanche + private static int fmix(int h1, int length) { + /* the input length is folded into the state before the final mix */ h1 ^= length; + return fmix(h1); + } + + /** Final mix for a 32-bit hash state. */ public static int fmix(int h) { + h ^= h >>> 16; + h *= 0x85ebca6b; + h ^= h >>> 13; + h *= 0xc2b2ae35; + h ^= h >>> 16; + return h; + } + + /** Final mix for a 64-bit hash state. */ public static long fmix(long h) { + h ^= (h >>> 33); + h *= 0xff51afd7ed558ccdL; + h ^= (h >>> 33); + h *= 0xc4ceb9fe1a85ec53L; + h ^= (h >>> 33); + return h; + } +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/factories/Factory.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/factories/Factory.java new file mode 100644 index 00000000000..b8a13b7443c --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/factories/Factory.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.factories; + +import org.apache.flink.cdc.common.annotation.PublicEvolving; +import org.apache.flink.cdc.common.configuration.ConfigOption; +import org.apache.flink.cdc.common.configuration.Configuration; +import org.apache.flink.configuration.ReadableConfig; + +import java.util.Set; + +/** + * Base interface for all kind of factories that create object instances from a list of key-value + * pairs in Flink CDC DataSource & DataSink API. + * + *

A factory is uniquely identified by {@link Class} and {@link #identifier()}. + * + *

The list of available factories is discovered using Java's Service Provider Interfaces (SPI). + * Classes that implement this interface can be added to {@code + * META-INF/services/org.apache.flink.cdc.common.factories.Factory} in JAR files. + * + *

Every factory declares a set of required and optional options. This information will not be + * used during discovery but is helpful when generating documentation and performing validation. A + * factory may discover further (nested) factories, the options of the nested factories must not be + * declared in the sets of this factory. + * + *

It is the responsibility of each factory to perform validation before returning an instance. + */ +@PublicEvolving +public interface Factory { + + /** Returns a unique identifier among same factory interfaces. */ + String identifier(); + + /** + * Returns a set of {@link ConfigOption} that an implementation of this factory requires in + * addition to {@link #optionalOptions()}. + */ + Set> requiredOptions(); + + /** + * Returns a set of {@link ConfigOption} that an implementation of this factory consumes in + * addition to {@link #requiredOptions()}. + */ + Set> optionalOptions(); + + /** Provides session information describing the factory to be accessed. */ + @PublicEvolving + interface Context { + + /** + * Returns the factory options used to create the object instances. + * + * @return options of the current session. + */ + Configuration getFactoryConfiguration(); + + /** Returns the configuration of the current pipeline. */ + Configuration getPipelineConfiguration(); + + /** + * Returns the class loader of the current session. + * + *

The class loader is in particular useful for discovering factories. + */ + ClassLoader getClassLoader(); + + /** Returns the Flink configuration of the current session. This default implementation returns a new, empty configuration on every call. */ + default ReadableConfig getFlinkConf() { + return new org.apache.flink.configuration.Configuration(); + } + } +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/factories/FactoryHelper.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/factories/FactoryHelper.java new file mode 100644 index 00000000000..8e4ff02276e --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/factories/FactoryHelper.java @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.flink.cdc.common.factories; + +import org.apache.flink.cdc.common.annotation.PublicEvolving; +import org.apache.flink.cdc.common.configuration.ConfigOption; +import org.apache.flink.cdc.common.configuration.Configuration; +import org.apache.flink.cdc.common.configuration.FallbackKey; +import org.apache.flink.cdc.common.utils.Preconditions; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.table.api.ValidationException; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +/** A helper for working with {@link Factory}. */ +@PublicEvolving +public class FactoryHelper { + + /** Factory whose declared options are validated. */ private final Factory factory; + /** Context that supplies the factory configuration to validate. */ private final Factory.Context context; + + private FactoryHelper(Factory factory, Factory.Context context) { + this.factory = factory; + this.context = context; + } + + /** Creates a helper for the given factory and context. */ public static FactoryHelper createFactoryHelper(Factory factory, Factory.Context context) { + return new FactoryHelper(factory, context); + } + + /** + * Validates the required and optional {@link ConfigOption}s of a factory. + * + *

Note: It does not check for left-over options. + */ + public static void validateFactoryOptions(Factory factory, Configuration configuration) { + /* delegates to the set-based overload with the options the factory declares */ validateFactoryOptions(factory.requiredOptions(), factory.optionalOptions(), configuration); + } + + /** + * Validates the required options and optional options. + * + *

Note: It does not check for left-over options. + */ + public static void validateFactoryOptions( + Set> requiredOptions, + Set> optionalOptions, + Configuration configuration) { + final List missingRequiredOptions = + requiredOptions.stream() + .filter(option -> configuration.get(option) == null) + .flatMap(FactoryHelper::allKeys) + .sorted() + .collect(Collectors.toList()); + + if (!missingRequiredOptions.isEmpty()) { + throw new ValidationException( + String.format( + "One or more required options are missing.\n\n" + + "Missing required options are:\n\n" + + "%s", + String.join("\n", missingRequiredOptions))); + } + + optionalOptions.forEach(configuration::getOptional); + } + + /** Validates unconsumed option keys. */ + public static void validateUnconsumedKeys( + String factoryIdentifier, Set allOptionKeys, Set consumedOptionKeys) { + final Set remainingOptionKeys = new HashSet<>(allOptionKeys); + remainingOptionKeys.removeAll(consumedOptionKeys); + if (!remainingOptionKeys.isEmpty()) { + throw new ValidationException( + String.format( + "Unsupported options found for '%s'.\n\n" + + "Unsupported options:\n\n" + + "%s\n\n" + + "Supported options:\n\n" + + "%s", + factoryIdentifier, + remainingOptionKeys.stream().sorted().collect(Collectors.joining("\n")), + String.join("\n", consumedOptionKeys))); + } + } + + /** Validates the options of the factory. It checks for unconsumed option keys. */ + public void validate() { + Set allOptionKeys = + Stream.concat( + factory.requiredOptions().stream().flatMap(FactoryHelper::allKeys), + factory.optionalOptions().stream().flatMap(FactoryHelper::allKeys)) + .collect(Collectors.toSet()); + + validateFactoryOptions(factory, context.getFactoryConfiguration()); + validateUnconsumedKeys( + factory.identifier(), context.getFactoryConfiguration().getKeys(), allOptionKeys); + } + + /** + * Validates the options of the factory. It checks for unconsumed option keys while ignoring the + * options with given prefixes. + * + *

The option keys that have given prefix {@code prefixToSkip} would just be skipped for + * validation. + * + * @param prefixesToSkip Set of option key prefixes to skip validation + */ + public void validateExcept(String... prefixesToSkip) { + Preconditions.checkArgument( + prefixesToSkip.length > 0, "Prefixes to skip can not be empty."); + + final List prefixesList = Arrays.asList(prefixesToSkip); + + Set allOptionKeys = + Stream.concat( + factory.requiredOptions().stream().flatMap(FactoryHelper::allKeys), + factory.optionalOptions().stream().flatMap(FactoryHelper::allKeys)) + .collect(Collectors.toSet()); + + /* configured keys that start with one of the given prefixes are left out of the unconsumed-key check */ Set filteredOptionKeys = + context.getFactoryConfiguration().getKeys().stream() + .filter(key -> prefixesList.stream().noneMatch(key::startsWith)) + .collect(Collectors.toSet()); + + validateFactoryOptions(factory, context.getFactoryConfiguration()); + validateUnconsumedKeys(factory.identifier(), filteredOptionKeys, allOptionKeys); + } + + /** Returns the key of the option followed by all of its fallback keys. */ private static Stream allKeys(ConfigOption option) { + return Stream.concat( + Stream.of(option.key()), + StreamSupport.stream(option.fallbackKeys().spliterator(), false) + .map(FallbackKey::getKey)); + } + + /** Collects the factory options whose key starts with the given prefix plus a dot, strips that prefix from the keys, and returns them as a Flink configuration. */ public ReadableConfig getFormatConfig(String formatPrefix) { + final String prefix = formatPrefix + "."; + Map formatConfigMap = new HashMap<>(); + context.getFactoryConfiguration() + .toMap() + .forEach( + (k, v) -> { + if (k.startsWith(prefix)) { + formatConfigMap.put(k.substring(prefix.length()), v); + } + }); + return org.apache.flink.configuration.Configuration.fromMap(formatConfigMap); + } + + /** Default implementation of {@link Factory.Context}.
+ */ + public static class DefaultContext implements Factory.Context { + + private final Configuration factoryConfiguration; + private final ClassLoader classLoader; + private final Configuration pipelineConfiguration; + private final ReadableConfig flinkConf; + + /** Creates a context whose Flink configuration is a new, empty configuration. */ public DefaultContext( + Configuration factoryConfiguration, + Configuration pipelineConfiguration, + ClassLoader classLoader) { + this( + factoryConfiguration, + pipelineConfiguration, + classLoader, + new org.apache.flink.configuration.Configuration()); + } + + /** Creates a context that returns the given values unchanged from its getters. */ public DefaultContext( + Configuration factoryConfiguration, + Configuration pipelineConfiguration, + ClassLoader classLoader, + ReadableConfig flinkConf) { + this.factoryConfiguration = factoryConfiguration; + this.pipelineConfiguration = pipelineConfiguration; + this.classLoader = classLoader; + this.flinkConf = flinkConf; + } + + @Override + public Configuration getFactoryConfiguration() { + return factoryConfiguration; + } + + @Override + public Configuration getPipelineConfiguration() { + return pipelineConfiguration; + } + + @Override + public ClassLoader getClassLoader() { + return classLoader; + } + + @Override + public ReadableConfig getFlinkConf() { + return flinkConf; + } + } +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/route/TableIdRouter.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/route/TableIdRouter.java new file mode 100755 index 00000000000..816039939ef --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/route/TableIdRouter.java @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.route; + +import org.apache.flink.api.java.tuple.Tuple3; +import org.apache.flink.cdc.common.annotation.PublicEvolving; +import org.apache.flink.cdc.common.event.TableId; +import org.apache.flink.cdc.common.schema.Selectors; + +import org.apache.flink.shaded.guava33.com.google.common.cache.CacheBuilder; +import org.apache.flink.shaded.guava33.com.google.common.cache.CacheLoader; +import org.apache.flink.shaded.guava33.com.google.common.cache.LoadingCache; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nonnull; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; +import java.util.stream.Collectors; + +/** + * Calculates how upstream data change events should be dispatched to downstream tables. Returns one + * or many destination Table IDs based on provided routing rules. + */ +@PublicEvolving +public class TableIdRouter { + + private static final Logger LOG = LoggerFactory.getLogger(TableIdRouter.class); + private static final Duration CACHE_EXPIRE_DURATION = Duration.ofDays(1); + + private final List> routes; + private final LoadingCache> routingCache; + + private static final String DOT_PLACEHOLDER = "_dot_placeholder_"; + + /** + * Currently, the supported regular expression syntax is not exactly the same as in {@link Selectors}. + * + *

The main discrepancies are: + * + *

1) {@link Selectors} uses {@code ,} to split table names instead of {@code |}. + * + *

2) If there is a need to use a dot ({@code .}) in a regular expression to match any + * character, it is necessary to escape the dot with a backslash. + * + *

3) The unescaped {@code .} is used as the separator of database and table name. When + * converting to Debezium style, it is expected to be escaped to match the dot ({@code .}) + * literally instead of the meta-character. + * + * @param tables CDC-style table capture list + * @return the equivalent standard regular expression + */ + public static String convertTableListToRegExpPattern(String tables) { + LOG.info("Rewriting CDC style table capture list: {}", tables); + + // In CDC-style table matching, table names could be separated by `,` character. + // Convert it to `|` as it's standard RegEx syntax. + tables = + Arrays.stream(tables.split(",")).map(String::trim).collect(Collectors.joining("|")); + LOG.info("Expression after replacing comma with vert separator: {}", tables); + + // Essentially, we're just trying to swap escaped `\\.` and unescaped `.`. + // In our table matching syntax, `\\.` means RegEx token matcher and `.` means database & + // table name separator. + // On the contrary, while we're matching TableId string, `\\.` means matching the "dot" + // literal and `.` is the meta-character. + + // Step 1: un-escape the dots, i.e. replace every `\.` with a placeholder (like `$`). + // For example, `db\.*.tbl\.*` => `db$*.tbl$*` + String unescapedTables = tables.replace("\\.", DOT_PLACEHOLDER); + LOG.info("Expression after un-escaping dots as RegEx meta-character: {}", unescapedTables); + + // Step 2: replace all remaining dots (`.`) to quoted version (`\.`), as a separator between + // database and table names. 
+ // For example, `db$*.tbl$*` => `db$*\.tbl$*` + String unescapedTablesWithDbTblSeparator = unescapedTables.replace(".", "\\."); + LOG.info("Re-escaping dots as TableId delimiter: {}", unescapedTablesWithDbTblSeparator); + + // Step 3: restore placeholder to normal RegEx matcher (`.`) + // For example, `db$*\.tbl$*` => `db.*\.tbl.*` + String standardRegExpTableCaptureList = + unescapedTablesWithDbTblSeparator.replace(DOT_PLACEHOLDER, "."); + LOG.info("Final standard RegExp table capture list: {}", standardRegExpTableCaptureList); + + return standardRegExpTableCaptureList; + } + + /** + * Compiles the source table expression of every routing rule into a {@link Pattern} and sets + * up the routing cache, whose entries expire {@code CACHE_EXPIRE_DURATION} after last access. + * + * @throws IllegalArgumentException if a converted source table expression is not a valid + * regular expression + */ + public TableIdRouter(List routingRules) { + this.routes = new ArrayList<>(); + for (RouteRule rule : routingRules) { + try { + routes.add( + new Tuple3<>( + Pattern.compile(convertTableListToRegExpPattern(rule.sourceTable)), + rule.sinkTable, + rule.replaceSymbol)); + } catch (PatternSyntaxException e) { + throw new IllegalArgumentException( + String.format( + "Failed to parse regular expression in routing rule %s. Notice that `.` is used to separate Table ID components. 
To use it as a regex token, put a `\\` before to escape it.", + rule), + e); + } + } + this.routingCache = + CacheBuilder.newBuilder() + .expireAfterAccess(CACHE_EXPIRE_DURATION) + .build( + new CacheLoader>() { + @Override + public @Nonnull List load(@Nonnull TableId key) { + return calculateRoute(key); + } + }); + } + + /** Returns the destination table IDs for the given source table, served from the routing cache. */ + public List route(TableId sourceTableId) { + return routingCache.getUnchecked(sourceTableId); + } + + /** + * Resolves one destination per matching route, in rule order. When no route matches, the + * source table ID itself is the only destination. + */ + private List calculateRoute(TableId sourceTableId) { + List routedTableIds = + routes.stream() + .filter(route -> matches(route.f0, sourceTableId)) + .map(route -> resolveReplacement(sourceTableId, route)) + .collect(Collectors.toList()); + if (routedTableIds.isEmpty()) { + routedTableIds.add(sourceTableId); + } + return routedTableIds; + } + + /** + * Builds the sink table ID for a route ({@code f0} = source pattern, {@code f1} = sink table, + * {@code f2} = replace symbol). With a replace symbol, that symbol in the sink table string is + * substituted by the source table name; otherwise the sink table string is used as a regex + * replacement over the source table ID string. + */ + private TableId resolveReplacement( + TableId originalTable, Tuple3 route) { + if (route.f2 != null) { + return TableId.parse(route.f1.replace(route.f2, originalTable.getTableName())); + } else { + Matcher matcher = route.f0.matcher(originalTable.toString()); + if (matcher.find()) { + return TableId.parse(matcher.replaceAll(route.f1)); + } + } + return TableId.parse(route.f1); + } + + /** + * Group the source tables that conform to the same routing rule together. The total number of + * groups is less than or equal to the number of routing rules. For the source tables within + * each group, their table structures will be merged to obtain the widest table structure in + * that group. The structures of all tables within the group will be expanded to this widest + * table structure. 
+ * + * @param tableIdSet The tables that need to be grouped by the router + * @return The tables grouped by the router + */ + public List> groupSourceTablesByRouteRule(Set tableIdSet) { + if (routes.isEmpty()) { + return new ArrayList<>(); + } + return routes.stream() + .map( + route -> + tableIdSet.stream() + .filter(tableId -> matches(route.f0, tableId)) + .collect(Collectors.toSet())) + .collect(Collectors.toList()); + } + + /** Checks whether the pattern matches the entire string form of the given table ID. */ + private static boolean matches(Pattern pattern, TableId tableId) { + return pattern.matcher(tableId.toString()).matches(); + } +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/schema/Selectors.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/schema/Selectors.java new file mode 100644 index 00000000000..21bc1c2ee21 --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/schema/Selectors.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.cdc.common.schema; + +import org.apache.flink.cdc.common.event.TableId; +import org.apache.flink.cdc.common.utils.Predicates; + +import org.apache.flink.shaded.guava33.com.google.common.cache.Cache; +import org.apache.flink.shaded.guava33.com.google.common.cache.CacheBuilder; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.function.Predicate; + +/** Selectors for filtering tables. */ +public class Selectors { + + private static final Duration CACHE_EXPIRE_DURATION = Duration.ofHours(1); + + private List selectors; + + /** Match results per table ID; at most 1024 entries, each expiring one hour after last access. */ + private final Cache cache = + CacheBuilder.newBuilder() + .expireAfterAccess(CACHE_EXPIRE_DURATION) + .maximumSize(1024) + .build(); + + private Selectors() {} + + /** + * A {@link Selector} that determines whether a table identified by a given {@link TableId} is + * to be included. + */ + private static class Selector { + private final Predicate namespacePred; + private final Predicate schemaNamePred; + private final Predicate tableNamePred; + + /** Builds one predicate per component; a {@code null} component yields a predicate that never matches. */ + public Selector(String namespace, String schemaName, String tableName) { + this.namespacePred = + namespace == null ? (namespacePred) -> false : Predicates.includes(namespace); + this.schemaNamePred = + schemaName == null + ? (schemaNamePred) -> false + : Predicates.includes(schemaName); + this.tableNamePred = + tableName == null ? 
(tableNamePred) -> false : Predicates.includes(tableName); + } + + /** + * Tests the table ID against the component predicates. A table ID without a namespace is + * tested on schema name and table name only; one without namespace and schema name is + * tested on the table name only. + */ + public boolean isMatch(TableId tableId) { + + String namespace = tableId.getNamespace(); + String schemaName = tableId.getSchemaName(); + + if (namespace == null || namespace.isEmpty()) { + if (schemaName == null || schemaName.isEmpty()) { + return tableNamePred.test(tableId.getTableName()); + } + return schemaNamePred.test(tableId.getSchemaName()) + && tableNamePred.test(tableId.getTableName()); + } + return namespacePred.test(tableId.getNamespace()) + && schemaNamePred.test(tableId.getSchemaName()) + && tableNamePred.test(tableId.getTableName()); + } + } + + /** + * Match the {@link TableId} against the {@link Selector}s. The result is looked up in the + * cache first and stored there after being computed. + */ + public boolean isMatch(TableId tableId) { + Boolean cachedResult = cache.getIfPresent(tableId); + if (cachedResult != null) { + return cachedResult; + } + + boolean match = computeIsMatch(tableId); + cache.put(tableId, match); + return match; + } + + /** Computes the match result if not present in the cache: true as soon as any selector matches. */ + private boolean computeIsMatch(TableId tableId) { + for (Selector selector : selectors) { + if (selector.isMatch(tableId)) { + return true; + } + } + return false; + } + + /** Builder for {@link Selectors}. 
*/ + public static class SelectorsBuilder { + + private List selectors; + + /** + * Parses {@code tableInclusions} into selectors, replacing any previously parsed ones. Each + * entry produced by {@code Predicates.RegExSplitterByComma} is further split by {@code + * Predicates.RegExSplitterByDot} into one (table), two (schema, table) or three (namespace, + * schema, table) parts. + * + * @throws IllegalArgumentException if the input is null or empty, or an entry does not have + * one to three parts + */ + public SelectorsBuilder includeTables(String tableInclusions) { + + if (tableInclusions == null || tableInclusions.isEmpty()) { + throw new IllegalArgumentException( + "Invalid table inclusion pattern cannot be null or empty"); + } + + List selectors = new ArrayList<>(); + Set tableSplitSet = + Predicates.setOf( + tableInclusions, Predicates.RegExSplitterByComma::split, (str) -> str); + for (String tableSplit : tableSplitSet) { + List tableIdList = + Predicates.listOf( + tableSplit, Predicates.RegExSplitterByDot::split, (str) -> str); + Iterator iterator = tableIdList.iterator(); + if (tableIdList.size() == 1) { + selectors.add(new Selector(null, null, iterator.next())); + } else if (tableIdList.size() == 2) { + selectors.add(new Selector(null, iterator.next(), iterator.next())); + } else if (tableIdList.size() == 3) { + selectors.add(new Selector(iterator.next(), iterator.next(), iterator.next())); + } else { + throw new IllegalArgumentException( + "Invalid table inclusion pattern: " + tableInclusions); + } + } + this.selectors = selectors; + return this; + } + + /** + * Creates a {@link Selectors} holding the selectors parsed by {@code includeTables}. + * NOTE(review): the selector list stays {@code null} if {@code includeTables} was never + * called - confirm that callers always call it before matching. 
+ */ + public Selectors build() { + Selectors selectors = new Selectors(); + selectors.selectors = this.selectors; + return selectors; + } + } +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/sink/EventSinkProvider.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/sink/EventSinkProvider.java new file mode 100644 index 00000000000..5cb8fb6e2bd --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/sink/EventSinkProvider.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.sink; + +import org.apache.flink.api.connector.sink2.Sink; +import org.apache.flink.cdc.common.annotation.PublicEvolving; +import org.apache.flink.streaming.api.functions.sink.legacy.SinkFunction; + +/** + * A marker interface used to provide an event sink for writing change events to external systems. + * We can reuse existing Flink {@link Sink} and Flink {@link SinkFunction} implementations, and we can + * support our own {@code EventSink} implementation in the future. + */ +@PublicEvolving +public interface EventSinkProvider {} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/sink/FlinkSinkFunctionProvider.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/sink/FlinkSinkFunctionProvider.java new file mode 100644 index 00000000000..8fcc11fe55f --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/sink/FlinkSinkFunctionProvider.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.sink; + +import org.apache.flink.cdc.common.annotation.PublicEvolving; +import org.apache.flink.cdc.common.event.Event; +import org.apache.flink.streaming.api.functions.sink.legacy.SinkFunction; + +/** + * {@code FlinkSinkFunctionProvider} is used to provide a Flink {@link SinkFunction} for writing + * events to external systems. + */ +@PublicEvolving +public interface FlinkSinkFunctionProvider extends EventSinkProvider { + + /** Get the {@link SinkFunction} for writing events to external systems. */ + SinkFunction getSinkFunction(); + + /** + * Create a {@link FlinkSinkFunctionProvider} whose {@link #getSinkFunction()} always returns + * the given {@link SinkFunction}. + */ + static FlinkSinkFunctionProvider of(SinkFunction sinkFunction) { + return () -> sinkFunction; + } +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/sink/FlinkSinkProvider.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/sink/FlinkSinkProvider.java new file mode 100644 index 00000000000..9daac630a4a --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/sink/FlinkSinkProvider.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.sink; + +import org.apache.flink.api.connector.sink2.Sink; +import org.apache.flink.cdc.common.annotation.PublicEvolving; +import org.apache.flink.cdc.common.event.Event; + +/** + * {@code FlinkSinkProvider} is used to provide a Flink {@link Sink} for writing events to external + * systems. + */ +@PublicEvolving +public interface FlinkSinkProvider extends EventSinkProvider { + + /** Get the {@link Sink} for writing events to external systems. */ + Sink getSink(); + + /** + * Create a {@link FlinkSinkProvider} whose {@link #getSink()} always returns the given {@link + * Sink}. + */ + static FlinkSinkProvider of(Sink sink) { + return () -> sink; + } +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/source/EventSourceProvider.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/source/EventSourceProvider.java new file mode 100644 index 00000000000..9ec4a8a1a5b --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/source/EventSourceProvider.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.source; + +import org.apache.flink.api.connector.source.Source; +import org.apache.flink.cdc.common.annotation.PublicEvolving; +import org.apache.flink.streaming.api.functions.source.legacy.SourceFunction; + +/** + * A marker interface used to provide an event source for reading events from external systems. We + * can reuse existing Flink {@link Source} and Flink {@link SourceFunction} implementations, and we + * can support our own {@code EventSource} implementation in the future. + */ +@PublicEvolving +public interface EventSourceProvider {} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/source/FlinkSourceFunctionProvider.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/source/FlinkSourceFunctionProvider.java new file mode 100644 index 00000000000..a78f2e1f94f --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/source/FlinkSourceFunctionProvider.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.source; + +import org.apache.flink.cdc.common.annotation.PublicEvolving; +import org.apache.flink.cdc.common.event.Event; +import org.apache.flink.streaming.api.functions.source.legacy.SourceFunction; + +/** + * {@code FlinkSourceFunctionProvider} is used to provide a Flink {@link SourceFunction} for reading + * events from external systems. + */ +@PublicEvolving +public interface FlinkSourceFunctionProvider extends EventSourceProvider { + + /** Get the {@link SourceFunction} for reading events from external systems. */ + SourceFunction getSourceFunction(); + + /** + * Create a {@link FlinkSourceFunctionProvider} whose {@link #getSourceFunction()} always + * returns the given {@link SourceFunction}. + */ + static FlinkSourceFunctionProvider of(SourceFunction sourceFunction) { + return () -> sourceFunction; + } +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/source/FlinkSourceProvider.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/source/FlinkSourceProvider.java new file mode 100644 index 00000000000..6a3e35f0be9 --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/source/FlinkSourceProvider.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.source; + +import org.apache.flink.api.connector.source.Source; +import org.apache.flink.cdc.common.annotation.PublicEvolving; +import org.apache.flink.cdc.common.event.Event; + +/** + * {@code FlinkSourceProvider} is used to provide a Flink {@link Source} for reading events from + * external systems. + */ +@PublicEvolving +public interface FlinkSourceProvider extends EventSourceProvider { + + /** Get the {@link Source} for reading events from external systems. */ + Source getSource(); + + /** + * Create a {@link FlinkSourceProvider} whose {@link #getSource()} always returns the given + * {@link Source}. + */ + static FlinkSourceProvider of(Source source) { + return () -> source; + } +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/types/DataField.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/types/DataField.java new file mode 100644 index 00000000000..68e2d17eba7 --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/types/DataField.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.types; + +import org.apache.flink.cdc.common.annotation.PublicEvolving; +import org.apache.flink.cdc.common.types.utils.DataTypeUtils; +import org.apache.flink.cdc.common.utils.Preconditions; +import org.apache.flink.table.types.utils.LogicalTypeDataTypeConverter; + +import javax.annotation.Nullable; + +import java.io.Serializable; +import java.util.Objects; + +import static org.apache.flink.cdc.common.utils.EncodingUtils.escapeIdentifier; +import static org.apache.flink.cdc.common.utils.EncodingUtils.escapeSingleQuotes; + +/** + * Defines the field of a row type. 
+ * + * @see RowType + */ +@PublicEvolving +public class DataField implements Serializable { + + private static final long serialVersionUID = 1L; + + public static final String FIELD_FORMAT_WITH_DESCRIPTION = "%s %s '%s'"; + + public static final String FIELD_FORMAT_NO_DESCRIPTION = "%s %s"; + + private final String name; + + private final DataType type; + + private final @Nullable String description; + + /** Creates a field; name and type must not be null, the description may be null. */ + public DataField(String name, DataType type, @Nullable String description) { + this.name = Preconditions.checkNotNull(name, "Field name must not be null."); + this.type = Preconditions.checkNotNull(type, "Field type must not be null."); + this.description = description; + } + + /** Creates a field without a description. */ + public DataField(String name, DataType type) { + this(name, type, null); + } + + public String getName() { + return name; + } + + public DataType getType() { + return type; + } + + @Nullable + public String getDescription() { + return description; + } + + /** Returns a new field with the same name and description and a copy of the type. */ + public DataField copy() { + return new DataField(name, type.copy(), description); + } + + /** Formats the field with the type's summary string; a present description is shown as {@code ...}. */ + public String asSummaryString() { + return formatString(type.asSummaryString(), true); + } + + /** Formats the field with the type's serializable string, including the description if present. */ + public String asSerializableString() { + return formatString(type.asSerializableString(), false); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + DataField rowField = (DataField) o; + return name.equals(rowField.name) + && type.equals(rowField.type) + && Objects.equals(description, rowField.description); + } + + @Override + public int hashCode() { + return Objects.hash(name, type, description); + } + + /** + * Renders the escaped field name followed by the type string. When a description exists it is + * appended in single quotes, either as the literal {@code ...} (if {@code excludeDescription}) + * or as the description passed through {@code escapeSingleQuotes}. + */ + private String formatString(String typeString, boolean excludeDescription) { + if (description == null) { + return String.format(FIELD_FORMAT_NO_DESCRIPTION, escapeIdentifier(name), typeString); + } else if (excludeDescription) { + return String.format( + FIELD_FORMAT_WITH_DESCRIPTION, escapeIdentifier(name), typeString, "..."); + } else { + 
return String.format( + FIELD_FORMAT_WITH_DESCRIPTION, + escapeIdentifier(name), + typeString, + escapeSingleQuotes(description)); + } + } + + /** Converts this field to a Flink table API field, carrying the description over when present. */ + public org.apache.flink.table.api.DataTypes.Field toFlinkDataTypeField() { + return description == null + ? org.apache.flink.table.api.DataTypes.FIELD( + name, DataTypeUtils.toFlinkDataType(type)) + : org.apache.flink.table.api.DataTypes.FIELD( + name, DataTypeUtils.toFlinkDataType(type), description); + } + + /** + * Creates a CDC field from a Flink row field, converting its logical type and using {@code + * null} as description when the row field has none. + */ + public static DataField fromFlinkDataTypeField( + org.apache.flink.table.types.logical.RowType.RowField rowField) { + return DataTypes.FIELD( + rowField.getName(), + DataTypeUtils.fromFlinkDataType( + LogicalTypeDataTypeConverter.toDataType(rowField.getType())), + rowField.getDescription().orElse(null)); + } +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/types/utils/DataTypeUtils.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/types/utils/DataTypeUtils.java new file mode 100644 index 00000000000..e712f838400 --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/types/utils/DataTypeUtils.java @@ -0,0 +1,306 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.flink.cdc.common.types.utils;
+
+import org.apache.flink.cdc.common.data.ArrayData;
+import org.apache.flink.cdc.common.data.DateData;
+import org.apache.flink.cdc.common.data.DecimalData;
+import org.apache.flink.cdc.common.data.MapData;
+import org.apache.flink.cdc.common.data.RecordData;
+import org.apache.flink.cdc.common.data.StringData;
+import org.apache.flink.cdc.common.data.TimeData;
+import org.apache.flink.cdc.common.data.TimestampData;
+import org.apache.flink.cdc.common.data.ZonedTimestampData;
+import org.apache.flink.cdc.common.types.DataField;
+import org.apache.flink.cdc.common.types.DataType;
+import org.apache.flink.cdc.common.types.DataTypes;
+import org.apache.flink.cdc.common.types.RowType;
+import org.apache.flink.cdc.common.utils.Preconditions;
+import org.apache.flink.table.types.logical.LogicalType;
+import org.apache.flink.util.CollectionUtil;
+
+import java.util.List;
+import java.util.stream.Collectors;
+
+import static org.apache.flink.table.types.logical.utils.LogicalTypeChecks.getLength;
+import static org.apache.flink.table.types.logical.utils.LogicalTypeChecks.getPrecision;
+import static org.apache.flink.table.types.logical.utils.LogicalTypeChecks.getScale;
+
+/** Utilities for handling {@link DataType}s. */
+public class DataTypeUtils {
+    /**
+     * Returns the conversion class for the given {@link DataType} that is used by the table runtime
+     * as internal data structure.
+     *
+     * @param type the CDC data type to look up
+     * @return the class of the internal representation for the type's root
+     * @throws IllegalArgumentException if the type root is not one of the handled roots
+     */
+    public static Class<?> toInternalConversionClass(DataType type) {
+        // ordered by type root definition
+        switch (type.getTypeRoot()) {
+            case CHAR:
+            case VARCHAR:
+                return StringData.class;
+            case BOOLEAN:
+                return Boolean.class;
+            case BINARY:
+            case VARBINARY:
+                return byte[].class;
+            case DECIMAL:
+                return DecimalData.class;
+            case TINYINT:
+                return Byte.class;
+            case SMALLINT:
+                return Short.class;
+            case INTEGER:
+                return Integer.class;
+            case DATE:
+                return DateData.class;
+            case TIME_WITHOUT_TIME_ZONE:
+                return TimeData.class;
+            case BIGINT:
+                return Long.class;
+            case FLOAT:
+                return Float.class;
+            case DOUBLE:
+                return Double.class;
+            case TIMESTAMP_WITHOUT_TIME_ZONE:
+            case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
+                return TimestampData.class;
+            case TIMESTAMP_WITH_TIME_ZONE:
+                return ZonedTimestampData.class;
+            case ARRAY:
+                return ArrayData.class;
+            case MAP:
+                return MapData.class;
+            case ROW:
+                return RecordData.class;
+            default:
+                throw new IllegalArgumentException("Illegal type: " + type);
+        }
+    }
+
+    /**
+     * Convert CDC's {@link DataType} to Flink's internal {@link
+     * org.apache.flink.table.types.DataType}.
+     *
+     * <p>The Flink type is first built in its default form and {@code notNull()} is applied once at
+     * the end when the CDC type is not nullable, mirroring {@link #fromFlinkDataType}.
+     *
+     * @param type the CDC data type to convert
+     * @return the corresponding Flink data type, carrying the nullability of {@code type}
+     * @throws IllegalArgumentException if the type root is not one of the handled roots
+     * @throws IllegalStateException if an ARRAY, MAP or ROW type lacks the required children
+     */
+    public static org.apache.flink.table.types.DataType toFlinkDataType(DataType type) {
+        // ordered by type root definition
+        List<DataType> children = type.getChildren();
+        int length = DataTypes.getLength(type).orElse(0);
+        int precision = DataTypes.getPrecision(type).orElse(0);
+        int scale = DataTypes.getScale(type).orElse(0);
+        org.apache.flink.table.types.DataType flinkType;
+        switch (type.getTypeRoot()) {
+            case CHAR:
+                flinkType = org.apache.flink.table.api.DataTypes.CHAR(length);
+                break;
+            case VARCHAR:
+                flinkType = org.apache.flink.table.api.DataTypes.VARCHAR(length);
+                break;
+            case BOOLEAN:
+                flinkType = org.apache.flink.table.api.DataTypes.BOOLEAN();
+                break;
+            case BINARY:
+                flinkType = org.apache.flink.table.api.DataTypes.BINARY(length);
+                break;
+            case VARBINARY:
+                flinkType = org.apache.flink.table.api.DataTypes.VARBINARY(length);
+                break;
+            case DECIMAL:
+                flinkType = org.apache.flink.table.api.DataTypes.DECIMAL(precision, scale);
+                break;
+            case TINYINT:
+                flinkType = org.apache.flink.table.api.DataTypes.TINYINT();
+                break;
+            case SMALLINT:
+                flinkType = org.apache.flink.table.api.DataTypes.SMALLINT();
+                break;
+            case INTEGER:
+                flinkType = org.apache.flink.table.api.DataTypes.INT();
+                break;
+            case DATE:
+                flinkType = org.apache.flink.table.api.DataTypes.DATE();
+                break;
+            case TIME_WITHOUT_TIME_ZONE:
+                flinkType = org.apache.flink.table.api.DataTypes.TIME(precision);
+                break;
+            case BIGINT:
+                flinkType = org.apache.flink.table.api.DataTypes.BIGINT();
+                break;
+            case FLOAT:
+                flinkType = org.apache.flink.table.api.DataTypes.FLOAT();
+                break;
+            case DOUBLE:
+                flinkType = org.apache.flink.table.api.DataTypes.DOUBLE();
+                break;
+            case TIMESTAMP_WITHOUT_TIME_ZONE:
+                flinkType = org.apache.flink.table.api.DataTypes.TIMESTAMP(precision);
+                break;
+            case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
+                flinkType =
+                        org.apache.flink.table.api.DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(
+                                precision);
+                break;
+            case TIMESTAMP_WITH_TIME_ZONE:
+                flinkType =
+                        org.apache.flink.table.api.DataTypes.TIMESTAMP_WITH_TIME_ZONE(precision);
+                break;
+            case ARRAY:
+                Preconditions.checkState(children != null && !children.isEmpty());
+                flinkType =
+                        org.apache.flink.table.api.DataTypes.ARRAY(
+                                toFlinkDataType(children.get(0)));
+                break;
+            case MAP:
+                Preconditions.checkState(children != null && children.size() > 1);
+                flinkType =
+                        org.apache.flink.table.api.DataTypes.MAP(
+                                toFlinkDataType(children.get(0)),
+                                toFlinkDataType(children.get(1)));
+                break;
+            case ROW:
+                Preconditions.checkState(!CollectionUtil.isNullOrEmpty(children));
+                RowType rowType = (RowType) type;
+                List<org.apache.flink.table.api.DataTypes.Field> fields =
+                        rowType.getFields().stream()
+                                .map(DataField::toFlinkDataTypeField)
+                                .collect(Collectors.toList());
+                flinkType = org.apache.flink.table.api.DataTypes.ROW(fields);
+                break;
+            default:
+                throw new IllegalArgumentException("Illegal type: " + type);
+        }
+        return type.isNullable() ? flinkType : flinkType.notNull();
+    }
+
+    /**
+     * Convert Flink's internal {@link org.apache.flink.table.types.DataType} to CDC's {@link
+     * DataType}.
+     *
+     * @param flinkType the Flink data type to convert
+     * @return the corresponding CDC data type, carrying the nullability of the logical type
+     * @throws IllegalArgumentException if the logical type root is unsupported or unknown
+     * @throws IllegalStateException if an ARRAY, MAP or ROW type lacks the required children
+     */
+    public static DataType fromFlinkDataType(org.apache.flink.table.types.DataType flinkType) {
+        LogicalType logicalType = flinkType.getLogicalType();
+        List<org.apache.flink.table.types.DataType> children = flinkType.getChildren();
+        DataType dataType;
+        switch (logicalType.getTypeRoot()) {
+            case CHAR:
+                dataType = DataTypes.CHAR(getLength(logicalType));
+                break;
+            case VARCHAR:
+                dataType = DataTypes.VARCHAR(getLength(logicalType));
+                break;
+            case BOOLEAN:
+                dataType = DataTypes.BOOLEAN();
+                break;
+            case BINARY:
+                dataType = DataTypes.BINARY(getLength(logicalType));
+                break;
+            case VARBINARY:
+                dataType = DataTypes.VARBINARY(getLength(logicalType));
+                break;
+            case DECIMAL:
+                dataType = DataTypes.DECIMAL(getPrecision(logicalType), getScale(logicalType));
+                break;
+            case TINYINT:
+                dataType = DataTypes.TINYINT();
+                break;
+            case SMALLINT:
+                dataType = DataTypes.SMALLINT();
+                break;
+            case INTEGER:
+                dataType = DataTypes.INT();
+                break;
+            case BIGINT:
+                dataType = DataTypes.BIGINT();
+                break;
+            case FLOAT:
+                dataType = DataTypes.FLOAT();
+                break;
+            case DOUBLE:
+                dataType = DataTypes.DOUBLE();
+                break;
+            case DATE:
+                dataType = DataTypes.DATE();
+                break;
+            case TIME_WITHOUT_TIME_ZONE:
+                dataType = DataTypes.TIME(getPrecision(logicalType));
+                break;
+            case TIMESTAMP_WITHOUT_TIME_ZONE:
+                dataType = DataTypes.TIMESTAMP(getPrecision(logicalType));
+                break;
+            case TIMESTAMP_WITH_TIME_ZONE:
+                dataType = DataTypes.TIMESTAMP_TZ(getPrecision(logicalType));
+                break;
+            case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
+                dataType = DataTypes.TIMESTAMP_LTZ(getPrecision(logicalType));
+                break;
+            case ARRAY:
+                Preconditions.checkState(children != null && !children.isEmpty());
+                dataType = DataTypes.ARRAY(fromFlinkDataType(children.get(0)));
+                break;
+            case MAP:
+                Preconditions.checkState(children != null && children.size() > 1);
+                dataType =
+                        DataTypes.MAP(
+                                fromFlinkDataType(children.get(0)),
+                                fromFlinkDataType(children.get(1)));
+                break;
+            case ROW:
+                Preconditions.checkState(!CollectionUtil.isNullOrEmpty(children));
+                org.apache.flink.table.types.logical.RowType rowType =
+                        (org.apache.flink.table.types.logical.RowType) flinkType.getLogicalType();
+                DataField[] fields =
+                        rowType.getFields().stream()
+                                .map(DataField::fromFlinkDataTypeField)
+                                .toArray(DataField[]::new);
+                dataType = DataTypes.ROW(fields);
+                break;
+            case INTERVAL_YEAR_MONTH:
+            case INTERVAL_DAY_TIME:
+            case NULL:
+            case MULTISET:
+            case DISTINCT_TYPE:
+            case STRUCTURED_TYPE:
+            case RAW:
+            case SYMBOL:
+            case UNRESOLVED:
+                // Known Flink roots that have no CDC counterpart.
+                throw new IllegalArgumentException("Unsupported type: " + flinkType);
+            default:
+                throw new IllegalArgumentException("Illegal type: " + flinkType);
+        }
+        return logicalType.isNullable() ? dataType : dataType.notNull();
+    }
+}
diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/types/utils/runtime/DataInputViewStream.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/types/utils/runtime/DataInputViewStream.java
new file mode 100644
index 00000000000..8fecafba71a
--- /dev/null
+++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/types/utils/runtime/DataInputViewStream.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.cdc.common.types.utils.runtime;
+
+import org.apache.flink.core.memory.DataInputView;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+
+/** An input stream that draws its data from a {@link DataInputView}. */
+public class DataInputViewStream extends InputStream {
+
+    protected DataInputView inputView;
+
+    /**
+     * Creates a stream that reads from the given view.
+     *
+     * @param inputView the view all reads and skips are delegated to
+     */
+    public DataInputViewStream(DataInputView inputView) {
+        this.inputView = inputView;
+    }
+
+    /** Returns the view this stream reads from. */
+    public DataInputView getInputView() {
+        return inputView;
+    }
+
+    /**
+     * Reads one byte from the view.
+     *
+     * @return the byte as a value in {@code [0, 255]}, or {@code -1} when the view signals the end
+     *     of its data with an {@link EOFException}
+     */
+    @Override
+    public int read() throws IOException {
+        try {
+            return inputView.readUnsignedByte();
+        } catch (EOFException ex) {
+            // InputStream reports end-of-stream as -1 rather than as an exception.
+            return -1;
+        }
+    }
+
+    /**
+     * Skips up to {@code n} bytes of the view.
+     *
+     * @param n the number of bytes to skip; non-positive values skip nothing
+     * @return the number of bytes actually skipped, never negative
+     */
+    @Override
+    public long skip(long n) throws IOException {
+        // InputStream.skip must not skip anything for n <= 0. Without this guard a negative count
+        // reached skipBytes after a long-to-int narrowing, which can even turn it into a positive
+        // count, and the method could return a negative result.
+        if (n <= 0) {
+            return 0;
+        }
+        long toSkipRemaining = n;
+        // skipBytes only takes an int, so large requests are served in Integer.MAX_VALUE chunks.
+        while (toSkipRemaining > Integer.MAX_VALUE) {
+            int skippedBytes = inputView.skipBytes(Integer.MAX_VALUE);
+
+            if (skippedBytes == 0) {
+                // No progress: report what has been skipped so far.
+                return n - toSkipRemaining;
+            }
+
+            toSkipRemaining -= skippedBytes;
+        }
+        return n - (toSkipRemaining - inputView.skipBytes((int) toSkipRemaining));
+    }
+
+    /** Delegates the bulk read to the view and returns its result unchanged. */
+    @Override
+    public int read(byte[] b, int off, int len) throws IOException {
+        return inputView.read(b, off, len);
+    }
+}
diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/types/utils/runtime/DataOutputViewStream.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/types/utils/runtime/DataOutputViewStream.java
new file mode 100644
index 00000000000..5798ceb123a
--- /dev/null
+++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/types/utils/runtime/DataOutputViewStream.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.cdc.common.types.utils.runtime;
+
+import org.apache.flink.core.memory.DataOutputView;
+
+import java.io.IOException;
+import java.io.OutputStream;
+
+/** Adapter exposing a {@link DataOutputView} through the {@link OutputStream} API. */
+public class DataOutputViewStream extends OutputStream {
+    protected DataOutputView outputView;
+
+    /**
+     * Wraps the given view.
+     *
+     * @param outputView the view that receives every byte written to this stream
+     */
+    public DataOutputViewStream(DataOutputView outputView) {
+        this.outputView = outputView;
+    }
+
+    /** Forwards a single byte to the wrapped view. */
+    @Override
+    public void write(int b) throws IOException {
+        this.outputView.writeByte(b);
+    }
+
+    /** Forwards the {@code len} bytes of {@code b} starting at {@code off} to the wrapped view. */
+    @Override
+    public void write(byte[] b, int off, int len) throws IOException {
+        this.outputView.write(b, off, len);
+    }
+}
diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/types/variant/BinaryVariant.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/types/variant/BinaryVariant.java
new file mode 100644
index 00000000000..5aab5369755
--- /dev/null
+++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/types/variant/BinaryVariant.java
@@ -0,0 +1,500 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.cdc.common.types.variant;
+
+import org.apache.flink.cdc.common.annotation.Internal;
+
+import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.core.JsonFactory;
+import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.core.JsonGenerator;
+
+import java.io.CharArrayWriter;
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.time.Instant;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.ZoneId;
+import java.time.ZoneOffset;
+import java.time.temporal.ChronoUnit;
+import java.util.Arrays;
+import java.util.Base64;
+import java.util.Objects;
+
+import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.BINARY_SEARCH_THRESHOLD;
+import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.SIZE_LIMIT;
+import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.TIMESTAMP_FORMATTER;
+import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.TIMESTAMP_LTZ_FORMATTER;
+import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.VERSION;
+import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.VERSION_MASK;
+import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.checkIndex;
+import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.getMetadataKey;
+import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.handleArray;
+import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.handleObject;
+import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.malformedVariant;
+import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.readUnsigned;
+import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.unexpectedType;
+import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.valueSize;
+import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.variantConstructorSizeLimit;
+
+/**
+ * Copy from BinaryVariant.java.
+ *
+ * <p>A data structure that represents a semi-structured value. It consists of two binary values:
+ * value and metadata. The value encodes types and values, but not field names. The metadata
+ * currently contains a version flag and a list of field names. We can extend/modify the detailed
+ * binary format given the version flag.
+ *
+ * <p>See the Variant Binary Encoding for the detail layout of the data structure.
+ */
+@Internal
+public final class BinaryVariant implements Variant {
+
+    // Shared factory for escapeJson, so a new factory is not built for every escaped string.
+    private static final JsonFactory JSON_FACTORY = new JsonFactory();
+
+    private final byte[] value;
+    private final byte[] metadata;
+    // The variant value doesn't use the whole `value` binary, but starts from its `pos` index and
+    // spans a size of `valueSize(value, pos)`. This design avoids frequent copies of the value
+    // binary when reading a sub-variant in the array/object element.
+    private final int pos;
+
+    /**
+     * Creates a variant whose value starts at index 0 of {@code value}.
+     *
+     * @param value the binary holding the encoded value
+     * @param metadata the binary holding the version flag and the field names
+     */
+    public BinaryVariant(byte[] value, byte[] metadata) {
+        this(value, metadata, 0);
+    }
+
+    // Creates a variant whose value starts at index `pos` of `value`; both arrays are stored as
+    // given, without copying.
+    private BinaryVariant(byte[] value, byte[] metadata, int pos) {
+        this.value = value;
+        this.metadata = metadata;
+        this.pos = pos;
+        // There is currently only one allowed version.
+        if (metadata.length < 1 || (metadata[0] & VERSION_MASK) != VERSION) {
+            throw malformedVariant();
+        }
+        // Don't attempt to use a Variant larger than 16 MiB. We'll never produce one, and it risks
+        // memory instability.
+        if (metadata.length > SIZE_LIMIT || value.length > SIZE_LIMIT) {
+            throw variantConstructorSizeLimit();
+        }
+    }
+
+    @Override
+    public boolean isPrimitive() {
+        return !isArray() && !isObject();
+    }
+
+    @Override
+    public boolean isArray() {
+        return getType() == Type.ARRAY;
+    }
+
+    @Override
+    public boolean isObject() {
+        return getType() == Type.OBJECT;
+    }
+
+    @Override
+    public boolean isNull() {
+        return getType() == Type.NULL;
+    }
+
+    @Override
+    public Type getType() {
+        return BinaryVariantUtil.getType(value, pos);
+    }
+
+    @Override
+    public boolean getBoolean() throws VariantTypeException {
+        checkType(Type.BOOLEAN, getType());
+        return BinaryVariantUtil.getBoolean(value, pos);
+    }
+
+    @Override
+    public byte getByte() throws VariantTypeException {
+        checkType(Type.TINYINT, getType());
+        return (byte) BinaryVariantUtil.getLong(value, pos);
+    }
+
+    @Override
+    public short getShort() throws VariantTypeException {
+        checkType(Type.SMALLINT, getType());
+        return (short) BinaryVariantUtil.getLong(value, pos);
+    }
+
+    @Override
+    public int getInt() throws VariantTypeException {
+        checkType(Type.INT, getType());
+        return (int) BinaryVariantUtil.getLong(value, pos);
+    }
+
+    @Override
+    public long getLong() throws VariantTypeException {
+        checkType(Type.BIGINT, getType());
+        return BinaryVariantUtil.getLong(value, pos);
+    }
+
+    @Override
+    public float getFloat() throws VariantTypeException {
+        checkType(Type.FLOAT, getType());
+        return BinaryVariantUtil.getFloat(value, pos);
+    }
+
+    @Override
+    public BigDecimal getDecimal() throws VariantTypeException {
+        checkType(Type.DECIMAL, getType());
+        return BinaryVariantUtil.getDecimal(value, pos);
+    }
+
+    @Override
+    public double getDouble() throws VariantTypeException {
+        checkType(Type.DOUBLE, getType());
+        return BinaryVariantUtil.getDouble(value, pos);
+    }
+
+    @Override
+    public String getString() throws VariantTypeException {
+        checkType(Type.STRING, getType());
+        return BinaryVariantUtil.getString(value, pos);
+    }
+
+    // The stored long is interpreted as a number of days since the epoch.
+    @Override
+    public LocalDate getDate() throws VariantTypeException {
+        checkType(Type.DATE, getType());
+        return LocalDate.ofEpochDay(BinaryVariantUtil.getLong(value, pos));
+    }
+
+    // The stored long is interpreted as microseconds since the epoch, rendered at UTC.
+    @Override
+    public LocalDateTime getDateTime() throws VariantTypeException {
+        checkType(Type.TIMESTAMP, getType());
+        return microsToInstant(BinaryVariantUtil.getLong(value, pos))
+                .atZone(ZoneOffset.UTC)
+                .toLocalDateTime();
+    }
+
+    // The stored long is interpreted as microseconds since the epoch.
+    @Override
+    public Instant getInstant() throws VariantTypeException {
+        checkType(Type.TIMESTAMP_LTZ, getType());
+        return microsToInstant(BinaryVariantUtil.getLong(value, pos));
+    }
+
+    @Override
+    public byte[] getBytes() throws VariantTypeException {
+        checkType(Type.BYTES, getType());
+        return BinaryVariantUtil.getBinary(value, pos);
+    }
+
+    // Returns the primitive value boxed as the Java type of the matching typed getter, or null
+    // for a NULL variant. Arrays and objects are rejected with a VariantTypeException.
+    @Override
+    public Object get() throws VariantTypeException {
+        switch (getType()) {
+            case NULL:
+                return null;
+            case BOOLEAN:
+                return getBoolean();
+            case TINYINT:
+                return getByte();
+            case SMALLINT:
+                return getShort();
+            case INT:
+                return getInt();
+            case BIGINT:
+                return getLong();
+            case FLOAT:
+                return getFloat();
+            case DOUBLE:
+                return getDouble();
+            case DECIMAL:
+                return getDecimal();
+            case STRING:
+                return getString();
+            case DATE:
+                return getDate();
+            case TIMESTAMP:
+                return getDateTime();
+            case TIMESTAMP_LTZ:
+                return getInstant();
+            case BYTES:
+                return getBytes();
+            default:
+                throw new VariantTypeException(
+                        String.format("Expecting a primitive variant but got %s", getType()));
+        }
+    }
+
+    // The cast is unchecked: a mismatch between T and the stored type only surfaces as a
+    // ClassCastException at the call site.
+    @Override
+    @SuppressWarnings("unchecked")
+    public <T> T getAs() throws VariantTypeException {
+        return (T) get();
+    }
+
+    @Override
+    public Variant getElement(int index) throws VariantTypeException {
+        return getElementAtIndex(index);
+    }
+
+    @Override
+    public Variant getField(String fieldName) throws VariantTypeException {
+        return getFieldByKey(fieldName);
+    }
+
+    @Override
+    public String toJson() {
+        StringBuilder sb = new StringBuilder();
+        toJsonImpl(value, metadata, pos, sb, ZoneOffset.UTC);
+        return sb.toString();
+    }
+
+    /**
+     * Returns the bytes of this variant's value.
+     *
+     * @return the backing array itself when {@code pos == 0}, otherwise a copy of the {@code
+     *     valueSize(value, pos)} bytes starting at {@code pos}
+     */
+    public byte[] getValue() {
+        if (pos == 0) {
+            return value;
+        }
+        int size = valueSize(value, pos);
+        checkIndex(pos + size - 1, value.length);
+        return Arrays.copyOfRange(value, pos, pos + size);
+    }
+
+    public byte[] getMetadata() {
+        return metadata;
+    }
+
+    public int getPos() {
+        return pos;
+    }
+
+    // Appends the JSON rendering of the variant at `pos` to `sb`, recursing into object fields and
+    // array elements. `zoneId` is only used to render TIMESTAMP_LTZ values.
+    private static void toJsonImpl(
+            byte[] value, byte[] metadata, int pos, StringBuilder sb, ZoneId zoneId) {
+        switch (BinaryVariantUtil.getType(value, pos)) {
+            case OBJECT:
+                handleObject(
+                        value,
+                        pos,
+                        (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> {
+                            sb.append('{');
+                            for (int i = 0; i < size; ++i) {
+                                int id = readUnsigned(value, idStart + idSize * i, idSize);
+                                int offset =
+                                        readUnsigned(
+                                                value, offsetStart + offsetSize * i, offsetSize);
+                                int elementPos = dataStart + offset;
+                                if (i != 0) {
+                                    sb.append(',');
+                                }
+                                sb.append(escapeJson(getMetadataKey(metadata, id)));
+                                sb.append(':');
+                                toJsonImpl(value, metadata, elementPos, sb, zoneId);
+                            }
+                            sb.append('}');
+                            return null;
+                        });
+                break;
+            case ARRAY:
+                handleArray(
+                        value,
+                        pos,
+                        (size, offsetSize, offsetStart, dataStart) -> {
+                            sb.append('[');
+                            for (int i = 0; i < size; ++i) {
+                                int offset =
+                                        readUnsigned(
+                                                value, offsetStart + offsetSize * i, offsetSize);
+                                int elementPos = dataStart + offset;
+                                if (i != 0) {
+                                    sb.append(',');
+                                }
+                                toJsonImpl(value, metadata, elementPos, sb, zoneId);
+                            }
+                            sb.append(']');
+                            return null;
+                        });
+                break;
+            case NULL:
+                sb.append("null");
+                break;
+            case BOOLEAN:
+                sb.append(BinaryVariantUtil.getBoolean(value, pos));
+                break;
+            case TINYINT:
+            case SMALLINT:
+            case INT:
+            case BIGINT:
+                sb.append(BinaryVariantUtil.getLong(value, pos));
+                break;
+            case STRING:
+                sb.append(escapeJson(BinaryVariantUtil.getString(value, pos)));
+                break;
+            case DOUBLE:
+                sb.append(BinaryVariantUtil.getDouble(value, pos));
+                break;
+            case DECIMAL:
+                sb.append(BinaryVariantUtil.getDecimal(value, pos).toPlainString());
+                break;
+            case DATE:
+                appendQuoted(
+                        sb,
+                        LocalDate.ofEpochDay((int) BinaryVariantUtil.getLong(value, pos))
+                                .toString());
+                break;
+            case TIMESTAMP_LTZ:
+                appendQuoted(
+                        sb,
+                        TIMESTAMP_LTZ_FORMATTER.format(
+                                microsToInstant(BinaryVariantUtil.getLong(value, pos))
+                                        .atZone(zoneId)));
+                break;
+            case TIMESTAMP:
+                appendQuoted(
+                        sb,
+                        TIMESTAMP_FORMATTER.format(
+                                microsToInstant(BinaryVariantUtil.getLong(value, pos))
+                                        .atZone(ZoneOffset.UTC)));
+                break;
+            case FLOAT:
+                sb.append(BinaryVariantUtil.getFloat(value, pos));
+                break;
+            case BYTES:
+                appendQuoted(
+                        sb,
+                        Base64.getEncoder()
+                                .encodeToString(BinaryVariantUtil.getBinary(value, pos)));
+                break;
+            default:
+                throw unexpectedType(BinaryVariantUtil.getType(value, pos));
+        }
+    }
+
+    private static Instant microsToInstant(long timestamp) {
+        return Instant.EPOCH.plus(timestamp, ChronoUnit.MICROS);
+    }
+
+    private void checkType(Type expected, Type actual) {
+        if (expected != actual) {
+            throw new VariantTypeException(
+                    String.format("Expected type %s but got %s", expected, actual));
+        }
+    }
+
+    // Find the field value whose key is equal to `key`. Return null if the key is not found.
+    // It is only legal to call it when `getType()` is `Type.OBJECT`.
+    private BinaryVariant getFieldByKey(String key) {
+        return handleObject(
+                value,
+                pos,
+                (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> {
+                    // Use linear search for a short list. Switch to binary search when the length
+                    // reaches `BINARY_SEARCH_THRESHOLD`.
+                    if (size < BINARY_SEARCH_THRESHOLD) {
+                        for (int i = 0; i < size; ++i) {
+                            int id = readUnsigned(value, idStart + idSize * i, idSize);
+                            if (key.equals(getMetadataKey(metadata, id))) {
+                                int offset =
+                                        readUnsigned(
+                                                value, offsetStart + offsetSize * i, offsetSize);
+                                return new BinaryVariant(value, metadata, dataStart + offset);
+                            }
+                        }
+                    } else {
+                        int low = 0;
+                        int high = size - 1;
+                        while (low <= high) {
+                            // Use unsigned right shift to compute the middle of `low` and `high`.
+                            // This is not only a performance optimization, because it can properly
+                            // handle the case where `low + high` overflows int.
+                            int mid = (low + high) >>> 1;
+                            int id = readUnsigned(value, idStart + idSize * mid, idSize);
+                            int cmp = getMetadataKey(metadata, id).compareTo(key);
+                            if (cmp < 0) {
+                                low = mid + 1;
+                            } else if (cmp > 0) {
+                                high = mid - 1;
+                            } else {
+                                int offset =
+                                        readUnsigned(
+                                                value, offsetStart + offsetSize * mid, offsetSize);
+                                return new BinaryVariant(value, metadata, dataStart + offset);
+                            }
+                        }
+                    }
+                    return null;
+                });
+    }
+
+    // Get the number of array elements in the variant.
+    // It is only legal to call it when `getType()` is `Type.ARRAY`.
+    @Override
+    public int arraySize() {
+        return handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> size);
+    }
+
+    // Get the array element at the `index` slot. Return null if `index` is out of the bound of
+    // `[0, arraySize())`.
+    // It is only legal to call it when `getType()` is `Type.ARRAY`.
+    private BinaryVariant getElementAtIndex(int index) {
+        return handleArray(
+                value,
+                pos,
+                (size, offsetSize, offsetStart, dataStart) -> {
+                    if (index < 0 || index >= size) {
+                        return null;
+                    }
+                    int offset = readUnsigned(value, offsetStart + offsetSize * index, offsetSize);
+                    return new BinaryVariant(value, metadata, dataStart + offset);
+                });
+    }
+
+    // Escape a string so that it can be pasted into JSON structure.
+    // For example, if `str` only contains a new-line character, then the result content is "\n"
+    // (4 characters).
+    private static String escapeJson(String str) {
+        try (CharArrayWriter writer = new CharArrayWriter();
+                JsonGenerator gen = JSON_FACTORY.createGenerator(writer)) {
+            gen.writeString(str);
+            gen.flush();
+            return writer.toString();
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private static void appendQuoted(StringBuilder sb, String str) {
+        sb.append('"');
+        sb.append(str);
+        sb.append('"');
+    }
+
+    @Override
+    public String toString() {
+        return toJson();
+    }
+
+    // Two variants are equal when they have the same `pos`, the same value bytes as returned by
+    // `getValue()` and the same metadata bytes.
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (!(o instanceof BinaryVariant)) {
+            return false;
+        }
+        BinaryVariant variant = (BinaryVariant) o;
+        return getPos() == variant.getPos()
+                && Objects.deepEquals(getValue(), variant.getValue())
+                && Objects.deepEquals(getMetadata(), variant.getMetadata());
+    }
+
+    // Hash exactly what equals() compares. Hashing the whole backing `value` array would give
+    // different hash codes to equal sub-variants that live in different parent binaries.
+    @Override
+    public int hashCode() {
+        return Objects.hash(Arrays.hashCode(getValue()), Arrays.hashCode(metadata), pos);
+    }
+}
diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/types/variant/BinaryVariantInternalBuilder.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/types/variant/BinaryVariantInternalBuilder.java
new file mode 100644
index 00000000000..b53c94bbe1f
--- /dev/null
+++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/types/variant/BinaryVariantInternalBuilder.java
@@ -0,0 +1,657 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.types.variant; + +import org.apache.flink.cdc.common.annotation.Internal; + +import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.core.JsonFactory; +import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.core.JsonParseException; +import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.core.JsonParser; +import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.core.JsonToken; +import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.core.exc.InputCoercionException; + +import java.io.IOException; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; + +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.ARRAY; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.BASIC_TYPE_MASK; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.BINARY; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.DATE; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.DECIMAL16; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.DECIMAL4; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.DECIMAL8; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.DOUBLE; +import static 
org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.FALSE; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.FLOAT; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.INT1; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.INT2; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.INT4; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.INT8; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.LONG_STR; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.MAX_DECIMAL16_PRECISION; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.MAX_DECIMAL4_PRECISION; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.MAX_DECIMAL8_PRECISION; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.MAX_SHORT_STR_SIZE; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.NULL; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.OBJECT; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.SIZE_LIMIT; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.TIMESTAMP; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.TIMESTAMP_LTZ; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.TRUE; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.U16_MAX; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.U24_MAX; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.U24_SIZE; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.U32_SIZE; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.U8_MAX; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.VERSION; +import static 
org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.arrayHeader; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.checkIndex; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.getMetadataKey; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.handleArray; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.handleObject; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.objectHeader; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.primitiveHeader; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.readUnsigned; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.shortStrHeader; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.valueSize; +import static org.apache.flink.cdc.common.types.variant.BinaryVariantUtil.writeLong; + +/** + * Copy from BinaryVariantBuilder.java + * + *

The internal builder for {@link BinaryVariant}. + */ +@Internal +public class BinaryVariantInternalBuilder { + + public static final VariantTypeException VARIANT_SIZE_LIMIT_EXCEPTION = + new VariantTypeException("VARIANT_SIZE_LIMIT"); + public static final VariantTypeException VARIANT_DUPLICATE_KEY_EXCEPTION = + new VariantTypeException("VARIANT_DUPLICATE_KEY"); + + public BinaryVariantInternalBuilder(boolean allowDuplicateKeys) { + this.allowDuplicateKeys = allowDuplicateKeys; + } + + /** + * Parse a JSON string as a Variant value. + * + * @throws IOException if any JSON parsing error happens. + */ + public static BinaryVariant parseJson(String json, boolean allowDuplicateKeys) + throws IOException { + try (JsonParser parser = new JsonFactory().createParser(json)) { + parser.nextToken(); + return parseJson(parser, allowDuplicateKeys); + } + } + + /** + * Similar {@link #parseJson(String, boolean)}, but takes a JSON parser instead of string input. + */ + private static BinaryVariant parseJson(JsonParser parser, boolean allowDuplicateKeys) + throws IOException { + BinaryVariantInternalBuilder builder = new BinaryVariantInternalBuilder(allowDuplicateKeys); + builder.buildJson(parser); + return builder.build(); + } + + // Build the variant metadata from `dictionaryKeys` and return the variant result. + public BinaryVariant build() { + int numKeys = dictionaryKeys.size(); + // Use long to avoid overflow in accumulating lengths. + long dictionaryStringSize = 0; + for (byte[] key : dictionaryKeys) { + dictionaryStringSize += key.length; + } + // Determine the number of bytes required per offset entry. + // The largest offset is the one-past-the-end value, which is total string size. It's very + // unlikely that the number of keys could be larger, but incorporate that into the + // calculation + // in case of pathological data. 
+ long maxSize = Math.max(dictionaryStringSize, numKeys); + if (maxSize > SIZE_LIMIT) { + throw VARIANT_SIZE_LIMIT_EXCEPTION; + } + int offsetSize = getIntegerSize((int) maxSize); + + int offsetStart = 1 + offsetSize; + int stringStart = offsetStart + (numKeys + 1) * offsetSize; + long metadataSize = stringStart + dictionaryStringSize; + + if (metadataSize > SIZE_LIMIT) { + throw VARIANT_SIZE_LIMIT_EXCEPTION; + } + byte[] metadata = new byte[(int) metadataSize]; + int headerByte = VERSION | ((offsetSize - 1) << 6); + writeLong(metadata, 0, headerByte, 1); + writeLong(metadata, 1, numKeys, offsetSize); + int currentOffset = 0; + for (int i = 0; i < numKeys; ++i) { + writeLong(metadata, offsetStart + i * offsetSize, currentOffset, offsetSize); + byte[] key = dictionaryKeys.get(i); + System.arraycopy(key, 0, metadata, stringStart + currentOffset, key.length); + currentOffset += key.length; + } + writeLong(metadata, offsetStart + numKeys * offsetSize, currentOffset, offsetSize); + return new BinaryVariant(Arrays.copyOfRange(writeBuffer, 0, writePos), metadata); + } + + public void appendString(String str) { + byte[] text = str.getBytes(StandardCharsets.UTF_8); + boolean longStr = text.length > MAX_SHORT_STR_SIZE; + checkCapacity((longStr ? 1 + U32_SIZE : 1) + text.length); + if (longStr) { + writeBuffer[writePos++] = primitiveHeader(LONG_STR); + writeLong(writeBuffer, writePos, text.length, U32_SIZE); + writePos += U32_SIZE; + } else { + writeBuffer[writePos++] = shortStrHeader(text.length); + } + System.arraycopy(text, 0, writeBuffer, writePos, text.length); + writePos += text.length; + } + + public void appendNull() { + checkCapacity(1); + writeBuffer[writePos++] = primitiveHeader(NULL); + } + + public void appendBoolean(boolean b) { + checkCapacity(1); + writeBuffer[writePos++] = primitiveHeader(b ? 
TRUE : FALSE); + } + + public void appendByte(byte b) { + checkCapacity(1 + 1); + writeBuffer[writePos++] = primitiveHeader(INT1); + writeLong(writeBuffer, writePos, b, 1); + writePos += 1; + } + + public void appendShort(short s) { + checkCapacity(1 + 2); + writeBuffer[writePos++] = primitiveHeader(INT2); + writeLong(writeBuffer, writePos, s, 2); + writePos += 2; + } + + public void appendInt(int i) { + checkCapacity(1 + 4); + writeBuffer[writePos++] = primitiveHeader(INT4); + writeLong(writeBuffer, writePos, i, 4); + writePos += 4; + } + + public void appendLong(long l) { + checkCapacity(1 + 8); + writeBuffer[writePos++] = primitiveHeader(INT8); + writeLong(writeBuffer, writePos, l, 8); + writePos += 8; + } + + public void appendNumeric(long l) { + if (l == (byte) l) { + appendByte((byte) l); + } else if (l == (short) l) { + appendShort((short) l); + } else if (l == (int) l) { + appendInt((int) l); + } else { + appendLong(l); + } + } + + public void appendDouble(double d) { + checkCapacity(1 + 8); + writeBuffer[writePos++] = primitiveHeader(DOUBLE); + writeLong(writeBuffer, writePos, Double.doubleToLongBits(d), 8); + writePos += 8; + } + + // Append a decimal value to the variant builder. The caller should guarantee that its precision + // and scale fit into `MAX_DECIMAL16_PRECISION`. 
+ public void appendDecimal(BigDecimal d) { + checkCapacity(2 + 16); + BigInteger unscaled = d.unscaledValue(); + if (d.scale() <= MAX_DECIMAL4_PRECISION && d.precision() <= MAX_DECIMAL4_PRECISION) { + writeBuffer[writePos++] = primitiveHeader(DECIMAL4); + writeBuffer[writePos++] = (byte) d.scale(); + writeLong(writeBuffer, writePos, unscaled.intValueExact(), 4); + writePos += 4; + } else if (d.scale() <= MAX_DECIMAL8_PRECISION && d.precision() <= MAX_DECIMAL8_PRECISION) { + writeBuffer[writePos++] = primitiveHeader(DECIMAL8); + writeBuffer[writePos++] = (byte) d.scale(); + writeLong(writeBuffer, writePos, unscaled.longValueExact(), 8); + writePos += 8; + } else { + assert d.scale() <= MAX_DECIMAL16_PRECISION && d.precision() <= MAX_DECIMAL16_PRECISION; + writeBuffer[writePos++] = primitiveHeader(DECIMAL16); + writeBuffer[writePos++] = (byte) d.scale(); + // `toByteArray` returns a big-endian representation. We need to copy it reversely and + // sign + // extend it to 16 bytes. + byte[] bytes = unscaled.toByteArray(); + for (int i = 0; i < bytes.length; ++i) { + writeBuffer[writePos + i] = bytes[bytes.length - 1 - i]; + } + byte sign = (byte) (bytes[0] < 0 ? 
-1 : 0); + for (int i = bytes.length; i < 16; ++i) { + writeBuffer[writePos + i] = sign; + } + writePos += 16; + } + } + + public void appendDate(int daysSinceEpoch) { + checkCapacity(1 + 4); + writeBuffer[writePos++] = primitiveHeader(DATE); + writeLong(writeBuffer, writePos, daysSinceEpoch, 4); + writePos += 4; + } + + public void appendTimestampLtz(long microsSinceEpoch) { + checkCapacity(1 + 8); + writeBuffer[writePos++] = primitiveHeader(TIMESTAMP_LTZ); + writeLong(writeBuffer, writePos, microsSinceEpoch, 8); + writePos += 8; + } + + public void appendTimestamp(long microsSinceEpoch) { + checkCapacity(1 + 8); + writeBuffer[writePos++] = primitiveHeader(TIMESTAMP); + writeLong(writeBuffer, writePos, microsSinceEpoch, 8); + writePos += 8; + } + + // Append a FLOAT value: one header byte followed by the 4 bytes of the float's raw int bits. + public void appendFloat(float f) { + checkCapacity(1 + 4); + writeBuffer[writePos++] = primitiveHeader(FLOAT); + // Write exactly the 4 payload bytes reserved above; writing 8 bytes could run past the end + // of `writeBuffer` when the float lands at the tail of the buffer. + writeLong(writeBuffer, writePos, Float.floatToIntBits(f), 4); + writePos += 4; + } + + public void appendBinary(byte[] binary) { + checkCapacity(1 + U32_SIZE + binary.length); + writeBuffer[writePos++] = primitiveHeader(BINARY); + writeLong(writeBuffer, writePos, binary.length, U32_SIZE); + writePos += U32_SIZE; + System.arraycopy(binary, 0, writeBuffer, writePos, binary.length); + writePos += binary.length; + } + + // Add a key to the variant dictionary. If the key already exists, the dictionary is not + // modified. + // In either case, return the id of the key. + public int addKey(String key) { + int id; + if (dictionary.containsKey(key)) { + id = dictionary.get(key); + } else { + id = dictionaryKeys.size(); + dictionary.put(key, id); + dictionaryKeys.add(key.getBytes(StandardCharsets.UTF_8)); + } + return id; + } + + // Return the current write position of the variant builder. It is used together with + // `finishWritingObject` or `finishWritingArray`. + public int getWritePos() { + return writePos; + } + + // Finish writing a variant object after all of its fields have already been written. 
The + // process + // is as follows: + // 1. The caller calls `getWritePos` before writing any fields to obtain the `start` parameter. + // 2. The caller appends all the object fields to the builder. In the meantime, it should + // maintain + // the `fields` parameter. Before appending each field, it should append an entry to `fields` to + // record the offset of the field. The offset is computed as `getWritePos() - start`. + // 3. The caller calls `finishWritingObject` to finish writing a variant object. + // + // This function is responsible to sort the fields by key. If there are duplicate field keys: + // - when `allowDuplicateKeys` is true, the field with the greatest offset value (the last + // appended one) is kept. + // - otherwise, throw an exception. + public void finishWritingObject(int start, ArrayList fields) { + int size = fields.size(); + Collections.sort(fields); + int maxId = size == 0 ? 0 : fields.get(0).id; + if (allowDuplicateKeys) { + int distinctPos = 0; + // Maintain a list of distinct keys in-place. + for (int i = 1; i < size; ++i) { + maxId = Math.max(maxId, fields.get(i).id); + if (fields.get(i).id == fields.get(i - 1).id) { + // Found a duplicate key. Keep the field with a greater offset. + if (fields.get(distinctPos).offset < fields.get(i).offset) { + fields.set( + distinctPos, + fields.get(distinctPos).withNewOffset(fields.get(i).offset)); + } + } else { + // Found a distinct key. Add the field to the list. + ++distinctPos; + fields.set(distinctPos, fields.get(i)); + } + } + if (distinctPos + 1 < fields.size()) { + size = distinctPos + 1; + // Resize `fields` to `size`. + fields.subList(size, fields.size()).clear(); + // Sort the fields by offsets so that we can move the value data of each field to + // the new + // offset without overwriting the fields after it. 
+ fields.sort(Comparator.comparingInt(f -> f.offset)); + int currentOffset = 0; + for (int i = 0; i < size; ++i) { + int oldOffset = fields.get(i).offset; + int fieldSize = valueSize(writeBuffer, start + oldOffset); + System.arraycopy( + writeBuffer, + start + oldOffset, + writeBuffer, + start + currentOffset, + fieldSize); + fields.set(i, fields.get(i).withNewOffset(currentOffset)); + currentOffset += fieldSize; + } + writePos = start + currentOffset; + // Change back to the sort order by field keys to meet the variant spec. + Collections.sort(fields); + } + } else { + for (int i = 1; i < size; ++i) { + maxId = Math.max(maxId, fields.get(i).id); + String key = fields.get(i).key; + if (key.equals(fields.get(i - 1).key)) { + throw VARIANT_DUPLICATE_KEY_EXCEPTION; + } + } + } + int dataSize = writePos - start; + boolean largeSize = size > U8_MAX; + int sizeBytes = largeSize ? U32_SIZE : 1; + int idSize = getIntegerSize(maxId); + int offsetSize = getIntegerSize(dataSize); + // The space for header byte, object size, id list, and offset list. + int headerSize = 1 + sizeBytes + size * idSize + (size + 1) * offsetSize; + checkCapacity(headerSize); + // Shift the just-written field data to make room for the object header section. + System.arraycopy(writeBuffer, start, writeBuffer, start + headerSize, dataSize); + writePos += headerSize; + writeBuffer[start] = objectHeader(largeSize, idSize, offsetSize); + writeLong(writeBuffer, start + 1, size, sizeBytes); + int idStart = start + 1 + sizeBytes; + int offsetStart = idStart + size * idSize; + for (int i = 0; i < size; ++i) { + writeLong(writeBuffer, idStart + i * idSize, fields.get(i).id, idSize); + writeLong(writeBuffer, offsetStart + i * offsetSize, fields.get(i).offset, offsetSize); + } + writeLong(writeBuffer, offsetStart + size * offsetSize, dataSize, offsetSize); + } + + // Finish writing a variant array after all of its elements have already been written. 
The + // process + // is similar to that of `finishWritingObject`. + public void finishWritingArray(int start, ArrayList offsets) { + int dataSize = writePos - start; + int size = offsets.size(); + boolean largeSize = size > U8_MAX; + int sizeBytes = largeSize ? U32_SIZE : 1; + int offsetSize = getIntegerSize(dataSize); + // The space for header byte, object size, and offset list. + int headerSize = 1 + sizeBytes + (size + 1) * offsetSize; + checkCapacity(headerSize); + // Shift the just-written field data to make room for the header section. + System.arraycopy(writeBuffer, start, writeBuffer, start + headerSize, dataSize); + writePos += headerSize; + writeBuffer[start] = arrayHeader(largeSize, offsetSize); + writeLong(writeBuffer, start + 1, size, sizeBytes); + int offsetStart = start + 1 + sizeBytes; + for (int i = 0; i < size; ++i) { + writeLong(writeBuffer, offsetStart + i * offsetSize, offsets.get(i), offsetSize); + } + writeLong(writeBuffer, offsetStart + size * offsetSize, dataSize, offsetSize); + } + + // Append a variant value to the variant builder. We need to insert the keys in the input + // variant + // into the current variant dictionary and rebuild it with new field ids. For scalar values in + // the + // input variant, we can directly copy the binary slice. 
+ public void appendVariant(BinaryVariant v) { + appendVariantImpl(v.getValue(), v.getMetadata(), v.getPos()); + } + + private void appendVariantImpl(byte[] value, byte[] metadata, int pos) { + checkIndex(pos, value.length); + int basicType = value[pos] & BASIC_TYPE_MASK; + switch (basicType) { + case OBJECT: + handleObject( + value, + pos, + (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + ArrayList fields = new ArrayList<>(size); + int start = writePos; + for (int i = 0; i < size; ++i) { + int id = readUnsigned(value, idStart + idSize * i, idSize); + int offset = + readUnsigned( + value, offsetStart + offsetSize * i, offsetSize); + int elementPos = dataStart + offset; + String key = getMetadataKey(metadata, id); + int newId = addKey(key); + fields.add(new FieldEntry(key, newId, writePos - start)); + appendVariantImpl(value, metadata, elementPos); + } + finishWritingObject(start, fields); + return null; + }); + break; + case ARRAY: + handleArray( + value, + pos, + (size, offsetSize, offsetStart, dataStart) -> { + ArrayList offsets = new ArrayList<>(size); + int start = writePos; + for (int i = 0; i < size; ++i) { + int offset = + readUnsigned( + value, offsetStart + offsetSize * i, offsetSize); + int elementPos = dataStart + offset; + offsets.add(writePos - start); + appendVariantImpl(value, metadata, elementPos); + } + finishWritingArray(start, offsets); + return null; + }); + break; + default: + shallowAppendVariantImpl(value, pos); + break; + } + } + + private void shallowAppendVariantImpl(byte[] value, int pos) { + int size = valueSize(value, pos); + checkIndex(pos + size - 1, value.length); + checkCapacity(size); + System.arraycopy(value, pos, writeBuffer, writePos, size); + writePos += size; + } + + private void checkCapacity(int additional) { + int required = writePos + additional; + if (required > writeBuffer.length) { + // Allocate a new buffer with a capacity of the next power of 2 of `required`. 
+ int newCapacity = Integer.highestOneBit(required); + newCapacity = newCapacity < required ? newCapacity * 2 : newCapacity; + if (newCapacity > SIZE_LIMIT) { + throw VARIANT_SIZE_LIMIT_EXCEPTION; + } + byte[] newValue = new byte[newCapacity]; + System.arraycopy(writeBuffer, 0, newValue, 0, writePos); + writeBuffer = newValue; + } + } + + /** + * Temporarily store the information of a field. We need to collect all fields in an JSON + * object, sort them by their keys, and build the variant object in sorted order. + */ + public static final class FieldEntry implements Comparable { + final String key; + final int id; + final int offset; + + public FieldEntry(String key, int id, int offset) { + this.key = key; + this.id = id; + this.offset = offset; + } + + FieldEntry withNewOffset(int newOffset) { + return new FieldEntry(key, id, newOffset); + } + + @Override + public int compareTo(FieldEntry other) { + return key.compareTo(other.key); + } + } + + private void buildJson(JsonParser parser) throws IOException { + JsonToken token = parser.currentToken(); + if (token == null) { + throw new JsonParseException(parser, "Unexpected null token"); + } + switch (token) { + case START_OBJECT: + { + ArrayList fields = new ArrayList<>(); + int start = writePos; + while (parser.nextToken() != JsonToken.END_OBJECT) { + String key = parser.currentName(); + parser.nextToken(); + int id = addKey(key); + fields.add(new FieldEntry(key, id, writePos - start)); + buildJson(parser); + } + finishWritingObject(start, fields); + break; + } + case START_ARRAY: + { + ArrayList offsets = new ArrayList<>(); + int start = writePos; + while (parser.nextToken() != JsonToken.END_ARRAY) { + offsets.add(writePos - start); + buildJson(parser); + } + finishWritingArray(start, offsets); + break; + } + case VALUE_STRING: + appendString(parser.getText()); + break; + case VALUE_NUMBER_INT: + try { + appendNumeric(parser.getLongValue()); + } catch (InputCoercionException ignored) { + // If the value doesn't 
fit any integer type, parse it as decimal or floating + // instead. + parseFloatingPoint(parser); + } + break; + case VALUE_NUMBER_FLOAT: + parseFloatingPoint(parser); + break; + case VALUE_TRUE: + appendBoolean(true); + break; + case VALUE_FALSE: + appendBoolean(false); + break; + case VALUE_NULL: + appendNull(); + break; + default: + throw new JsonParseException(parser, "Unexpected token " + token); + } + } + + // Choose the smallest unsigned integer type that can store `value`. It must be within + // `[0, U24_MAX]`. + private int getIntegerSize(int value) { + assert value >= 0 && value <= U24_MAX; + if (value <= U8_MAX) { + return 1; + } + if (value <= U16_MAX) { + return 2; + } + return U24_SIZE; + } + + private void parseFloatingPoint(JsonParser parser) throws IOException { + if (!tryParseDecimal(parser.getText())) { + appendDouble(parser.getDoubleValue()); + } + } + + // Try to parse a JSON number as a decimal. Return whether the parsing succeeds. The input must + // only use the decimal format (an integer value with an optional '.' in it) and must not use + // scientific notation. It also must fit into the precision limitation of decimal types. + private boolean tryParseDecimal(String input) { + for (int i = 0; i < input.length(); ++i) { + char ch = input.charAt(i); + if (ch != '-' && ch != '.' && !(ch >= '0' && ch <= '9')) { + return false; + } + } + BigDecimal d = new BigDecimal(input); + if (d.scale() <= MAX_DECIMAL16_PRECISION && d.precision() <= MAX_DECIMAL16_PRECISION) { + appendDecimal(d); + return true; + } + return false; + } + + // The write buffer in building the variant value. Its first `writePos` bytes has been written. + private byte[] writeBuffer = new byte[128]; + private int writePos = 0; + // Map keys to a monotonically increasing id. + private final HashMap dictionary = new HashMap<>(); + // Store all keys in `dictionary` in the order of id. 
+ private final ArrayList dictionaryKeys = new ArrayList<>(); + private final boolean allowDuplicateKeys; +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/utils/InstantiationUtil.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/utils/InstantiationUtil.java new file mode 100644 index 00000000000..09e2285c9a4 --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/utils/InstantiationUtil.java @@ -0,0 +1,263 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.cdc.common.utils; + +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.core.memory.DataInputViewStreamWrapper; +import org.apache.flink.core.memory.DataOutputViewStreamWrapper; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.ObjectStreamClass; +import java.io.OutputStream; +import java.io.Serializable; +import java.lang.reflect.Modifier; +import java.lang.reflect.Proxy; +import java.util.HashMap; +import java.util.zip.DeflaterOutputStream; + +/** Utility class to create instances from class objects. */ +public class InstantiationUtil { + private InstantiationUtil() { + // no instantiation + } + + /** A custom ObjectInputStream that can load classes using a specific ClassLoader. */ + public static class ClassLoaderObjectInputStream extends ObjectInputStream { + + protected final ClassLoader classLoader; + + public ClassLoaderObjectInputStream(InputStream in, ClassLoader classLoader) + throws IOException { + super(in); + this.classLoader = classLoader; + } + + @Override + protected Class resolveClass(ObjectStreamClass desc) + throws IOException, ClassNotFoundException { + if (classLoader != null) { + String name = desc.getName(); + try { + return Class.forName(name, false, classLoader); + } catch (ClassNotFoundException ex) { + // check if class is a primitive class + Class cl = primitiveClasses.get(name); + if (cl != null) { + // return primitive class + return cl; + } else { + // throw ClassNotFoundException + throw ex; + } + } + } + + return super.resolveClass(desc); + } + + @Override + protected Class resolveProxyClass(String[] interfaces) + throws IOException, ClassNotFoundException { + if (classLoader != null) { + ClassLoader nonPublicLoader = null; + boolean hasNonPublicInterface = false; + + // define proxy in class 
loader of non-public interface(s), if any + Class[] classObjs = new Class[interfaces.length]; + for (int i = 0; i < interfaces.length; i++) { + Class cl = Class.forName(interfaces[i], false, classLoader); + if ((cl.getModifiers() & Modifier.PUBLIC) == 0) { + if (hasNonPublicInterface) { + if (nonPublicLoader != cl.getClassLoader()) { + throw new IllegalAccessError( + "conflicting non-public interface class loaders"); + } + } else { + nonPublicLoader = cl.getClassLoader(); + hasNonPublicInterface = true; + } + } + classObjs[i] = cl; + } + try { + return Proxy.getProxyClass( + hasNonPublicInterface ? nonPublicLoader : classLoader, classObjs); + } catch (IllegalArgumentException e) { + throw new ClassNotFoundException(null, e); + } + } + + return super.resolveProxyClass(interfaces); + } + + // ------------------------------------------------ + + private static final HashMap> primitiveClasses = new HashMap<>(9); + + static { + primitiveClasses.put("boolean", boolean.class); + primitiveClasses.put("byte", byte.class); + primitiveClasses.put("char", char.class); + primitiveClasses.put("short", short.class); + primitiveClasses.put("int", int.class); + primitiveClasses.put("long", long.class); + primitiveClasses.put("float", float.class); + primitiveClasses.put("double", double.class); + primitiveClasses.put("void", void.class); + } + } + + public static byte[] serializeObject(Object o) throws IOException { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); + ObjectOutputStream oos = new ObjectOutputStream(baos)) { + oos.writeObject(o); + oos.flush(); + return baos.toByteArray(); + } + } + + public static void serializeObject(OutputStream out, Object o) throws IOException { + ObjectOutputStream oos = + out instanceof ObjectOutputStream + ? 
(ObjectOutputStream) out + : new ObjectOutputStream(out); + oos.writeObject(o); + } + + public static byte[] serializeObjectAndCompress(Object o) throws IOException { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); + DeflaterOutputStream dos = new DeflaterOutputStream(baos); + ObjectOutputStream oos = new ObjectOutputStream(dos)) { + oos.writeObject(o); + oos.flush(); + dos.close(); + return baos.toByteArray(); + } + } + + public static boolean isSerializable(Object o) { + try { + serializeObject(o); + } catch (IOException e) { + return false; + } + + return true; + } + + public static byte[] serializeToByteArray(TypeSerializer serializer, T record) + throws IOException { + if (record == null) { + throw new NullPointerException("Record to serialize to byte array must not be null."); + } + + ByteArrayOutputStream bos = new ByteArrayOutputStream(64); + DataOutputViewStreamWrapper outputViewWrapper = new DataOutputViewStreamWrapper(bos); + serializer.serialize(record, outputViewWrapper); + return bos.toByteArray(); + } + + public static T deserializeFromByteArray(TypeSerializer serializer, byte[] buf) + throws IOException { + if (buf == null) { + throw new NullPointerException("Byte array to deserialize from must not be null."); + } + + DataInputViewStreamWrapper inputViewWrapper = + new DataInputViewStreamWrapper(new ByteArrayInputStream(buf)); + return serializer.deserialize(inputViewWrapper); + } + + public static T deserializeFromByteArray(TypeSerializer serializer, T reuse, byte[] buf) + throws IOException { + if (buf == null) { + throw new NullPointerException("Byte array to deserialize from must not be null."); + } + + DataInputViewStreamWrapper inputViewWrapper = + new DataInputViewStreamWrapper(new ByteArrayInputStream(buf)); + return serializer.deserialize(reuse, inputViewWrapper); + } + + @SuppressWarnings("unchecked") + public static T deserializeObject(byte[] bytes, ClassLoader cl) + throws IOException, ClassNotFoundException { + return 
deserializeObject(new ByteArrayInputStream(bytes), cl); + } + + @SuppressWarnings("unchecked") + public static T deserializeObject(InputStream in, ClassLoader cl) + throws IOException, ClassNotFoundException { + + final ClassLoader old = Thread.currentThread().getContextClassLoader(); + // not using resource try to avoid AutoClosable's close() on the given stream + try { + ObjectInputStream oois = new ClassLoaderObjectInputStream(in, cl); + Thread.currentThread().setContextClassLoader(cl); + return (T) oois.readObject(); + } finally { + Thread.currentThread().setContextClassLoader(old); + } + } + + /** + * Clones the given serializable object using Java serialization. + * + * @param obj Object to clone + * @param Type of the object to clone + * @return The cloned object + * @throws IOException Thrown if the serialization or deserialization process fails. + * @throws ClassNotFoundException Thrown if any of the classes referenced by the object cannot + * be resolved during deserialization. + */ + public static T clone(T obj) + throws IOException, ClassNotFoundException { + if (obj == null) { + return null; + } else { + return clone(obj, obj.getClass().getClassLoader()); + } + } + + /** + * Clones the given serializable object using Java serialization, using the given classloader to + * resolve the cloned classes. + * + * @param obj Object to clone + * @param classLoader The classloader to resolve the classes during deserialization. + * @param Type of the object to clone + * @return Cloned object + * @throws IOException Thrown if the serialization or deserialization process fails. + * @throws ClassNotFoundException Thrown if any of the classes referenced by the object cannot + * be resolved during deserialization. 
+ */ + public static T clone(T obj, ClassLoader classLoader) + throws IOException, ClassNotFoundException { + if (obj == null) { + return null; + } else { + final byte[] serializedObject = serializeObject(obj); + return deserializeObject(serializedObject, classLoader); + } + } +} diff --git a/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/utils/SchemaMergingUtils.java b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/utils/SchemaMergingUtils.java new file mode 100644 index 00000000000..665aef5f509 --- /dev/null +++ b/flink-cdc-common-2.x/src/main/java/org/apache/flink/cdc/common/utils/SchemaMergingUtils.java @@ -0,0 +1,1058 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.cdc.common.utils; + +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.cdc.common.annotation.PublicEvolving; +import org.apache.flink.cdc.common.annotation.VisibleForTesting; +import org.apache.flink.cdc.common.data.DateData; +import org.apache.flink.cdc.common.data.DecimalData; +import org.apache.flink.cdc.common.data.LocalZonedTimestampData; +import org.apache.flink.cdc.common.data.StringData; +import org.apache.flink.cdc.common.data.TimeData; +import org.apache.flink.cdc.common.data.TimestampData; +import org.apache.flink.cdc.common.data.ZonedTimestampData; +import org.apache.flink.cdc.common.data.binary.BinaryStringData; +import org.apache.flink.cdc.common.event.AddColumnEvent; +import org.apache.flink.cdc.common.event.AlterColumnTypeEvent; +import org.apache.flink.cdc.common.event.CreateTableEvent; +import org.apache.flink.cdc.common.event.DropColumnEvent; +import org.apache.flink.cdc.common.event.SchemaChangeEvent; +import org.apache.flink.cdc.common.event.TableId; +import org.apache.flink.cdc.common.schema.Column; +import org.apache.flink.cdc.common.schema.Schema; +import org.apache.flink.cdc.common.types.ArrayType; +import org.apache.flink.cdc.common.types.BigIntType; +import org.apache.flink.cdc.common.types.BinaryType; +import org.apache.flink.cdc.common.types.BooleanType; +import org.apache.flink.cdc.common.types.CharType; +import org.apache.flink.cdc.common.types.DataType; +import org.apache.flink.cdc.common.types.DataTypeFamily; +import org.apache.flink.cdc.common.types.DataTypeRoot; +import org.apache.flink.cdc.common.types.DataTypes; +import org.apache.flink.cdc.common.types.DateType; +import org.apache.flink.cdc.common.types.DecimalType; +import org.apache.flink.cdc.common.types.DoubleType; +import org.apache.flink.cdc.common.types.FloatType; +import org.apache.flink.cdc.common.types.IntType; +import org.apache.flink.cdc.common.types.LocalZonedTimestampType; +import 
org.apache.flink.cdc.common.types.MapType; +import org.apache.flink.cdc.common.types.RowType; +import org.apache.flink.cdc.common.types.SmallIntType; +import org.apache.flink.cdc.common.types.TimeType; +import org.apache.flink.cdc.common.types.TimestampType; +import org.apache.flink.cdc.common.types.TinyIntType; +import org.apache.flink.cdc.common.types.VarBinaryType; +import org.apache.flink.cdc.common.types.VarCharType; +import org.apache.flink.cdc.common.types.VariantType; +import org.apache.flink.cdc.common.types.ZonedTimestampType; +import org.apache.flink.cdc.common.types.variant.Variant; + +import org.apache.flink.shaded.guava33.com.google.common.collect.ArrayListMultimap; +import org.apache.flink.shaded.guava33.com.google.common.collect.ImmutableList; +import org.apache.flink.shaded.guava33.com.google.common.collect.Streams; +import org.apache.flink.shaded.guava33.com.google.common.io.BaseEncoding; + +import javax.annotation.Nullable; + +import java.math.BigDecimal; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.ZoneId; +import java.time.ZoneOffset; +import java.time.ZonedDateTime; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + +/** + * Utils for merging {@link Schema}s and {@link DataType}s. Prefer using this over {@link + * SchemaUtils} to get consistent schema merging behaviors. + */ +@PublicEvolving +public class SchemaMergingUtils { + /** + * Checking if given {@code upcomingSchema} could be fit into currently known {@code + * currentSchema}. Current schema could be null (as the cold opening state, and in this case it + * always returns {@code false}) but the upcoming schema should never be null.
+ * This method only checks columns' type compatibility, but ignores metadata fields like + * primaryKeys, partitionKeys, options. + */ + public static boolean isSchemaCompatible( + @Nullable Schema currentSchema, Schema upcomingSchema) { + if (currentSchema == null) { + return false; + } + Map currentColumnTypes = + currentSchema.getColumns().stream() + .collect(Collectors.toMap(Column::getName, Column::getType)); + List upcomingColumns = upcomingSchema.getColumns(); + + for (Column upcomingColumn : upcomingColumns) { + String columnName = upcomingColumn.getName(); + DataType upcomingColumnType = upcomingColumn.getType(); + DataType currentColumnType = currentColumnTypes.get(columnName); + + if (!isDataTypeCompatible(currentColumnType, upcomingColumnType)) { + return false; + } + } + return true; + } + + /** + * Try to merge {@code upcomingSchema} into {@code currentSchema} by performing lenient schema + * changes. Returns a wider schema that could both of them. + */ + public static Schema getLeastCommonSchema( + @Nullable Schema currentSchema, Schema upcomingSchema) { + // No current schema record, we need to create it first. + if (currentSchema == null) { + return upcomingSchema; + } + + // Current schema is compatible with upcoming ones, just return it and perform no schema + // evolution. 
+ if (isSchemaCompatible(currentSchema, upcomingSchema)) { + return currentSchema; + } + + Map newTypeMapping = new HashMap<>(); + + Map currentColumns = + currentSchema.getColumns().stream() + .collect(Collectors.toMap(Column::getName, col -> col)); + List upcomingColumns = upcomingSchema.getColumns(); + + List appendedColumns = new ArrayList<>(); + + for (Column upcomingColumn : upcomingColumns) { + String columnName = upcomingColumn.getName(); + DataType upcomingColumnType = upcomingColumn.getType(); + if (currentColumns.containsKey(columnName)) { + Column currentColumn = currentColumns.get(columnName); + DataType currentColumnType = currentColumn.getType(); + DataType leastCommonType = + getLeastCommonType(currentColumnType, upcomingColumnType); + if (!Objects.equals(leastCommonType, currentColumnType)) { + newTypeMapping.put(columnName, leastCommonType); + } + } else { + appendedColumns.add(upcomingColumn); + } + } + + List commonColumns = new ArrayList<>(); + for (Column column : currentSchema.getColumns()) { + if (newTypeMapping.containsKey(column.getName())) { + commonColumns.add(column.copy(newTypeMapping.get(column.getName()))); + } else { + commonColumns.add(column); + } + } + + commonColumns.addAll(appendedColumns); + return currentSchema.copy(commonColumns); + } + + /** Merge compatible schemas. */ + public static Schema getCommonSchema(List schemas) { + if (schemas.isEmpty()) { + return null; + } else if (schemas.size() == 1) { + return schemas.get(0); + } else { + Schema outputSchema = null; + for (Schema schema : schemas) { + outputSchema = getLeastCommonSchema(outputSchema, schema); + } + return outputSchema; + } + } + + /** + * Generating what schema change events we need to do by converting compatible {@code + * beforeSchema} to {@code afterSchema}. 
+ */ + public static List getSchemaDifference( + TableId tableId, @Nullable Schema beforeSchema, Schema afterSchema) { + if (beforeSchema == null) { + return Collections.singletonList(new CreateTableEvent(tableId, afterSchema)); + } + + Map beforeColumns = + beforeSchema.getColumns().stream() + .collect(Collectors.toMap(Column::getName, col -> col)); + + Map oldTypeMapping = new HashMap<>(); + Map newTypeMapping = new HashMap<>(); + List appendedColumns = new ArrayList<>(); + + String afterWhichColumnPosition = null; + for (Column afterColumn : afterSchema.getColumns()) { + String columnName = afterColumn.getName(); + DataType afterType = afterColumn.getType(); + if (beforeColumns.containsKey(columnName)) { + DataType beforeType = beforeColumns.get(columnName).getType(); + if (!Objects.equals(beforeType, afterType)) { + oldTypeMapping.put(columnName, beforeType); + newTypeMapping.put(columnName, afterType); + } + beforeColumns.remove(columnName); + } else { + if (afterWhichColumnPosition == null) { + appendedColumns.add( + new AddColumnEvent.ColumnWithPosition( + afterColumn, AddColumnEvent.ColumnPosition.FIRST, null)); + } else { + appendedColumns.add( + new AddColumnEvent.ColumnWithPosition( + afterColumn, + AddColumnEvent.ColumnPosition.AFTER, + afterWhichColumnPosition)); + } + } + afterWhichColumnPosition = afterColumn.getName(); + } + + List schemaChangeEvents = new ArrayList<>(); + if (!appendedColumns.isEmpty()) { + schemaChangeEvents.add(new AddColumnEvent(tableId, appendedColumns)); + } + + if (!newTypeMapping.isEmpty()) { + schemaChangeEvents.add( + new AlterColumnTypeEvent(tableId, newTypeMapping, oldTypeMapping)); + } + + if (!beforeColumns.isEmpty()) { + schemaChangeEvents.add( + new DropColumnEvent(tableId, new ArrayList<>(beforeColumns.keySet()))); + } + return schemaChangeEvents; + } + + /** + * Coercing {@code upcomingRow} with {@code upcomingTypes} schema into {@code currentTypes} + * schema. 
Invoking this method implicitly assumes that {@code isSchemaCompatible(currentSchema, + * upcomingSchema)} returns true. Otherwise, some upstream records might be lost. + */ + public static Object[] coerceRow( + String timezone, + Schema currentSchema, + Schema upcomingSchema, + List upcomingRow) { + return coerceRow(timezone, currentSchema, upcomingSchema, upcomingRow, true); + } + + /** + * Coercing {@code upcomingRow} with {@code upcomingTypes} schema into {@code currentTypes} + * schema. Invoking this method implicitly assumes that {@code isSchemaCompatible(currentSchema, + * upcomingSchema)} returns true. Otherwise, some upstream records might be lost. + */ + public static Object[] coerceRow( + String timezone, + Schema currentSchema, + Schema upcomingSchema, + List upcomingRow, + boolean toleranceMode) { + List currentColumns = currentSchema.getColumns(); + Map upcomingColumnTypes = + upcomingSchema.getColumns().stream() + .collect(Collectors.toMap(Column::getName, Column::getType)); + Map upcomingColumnObjects = + Streams.zip( + upcomingSchema.getColumnNames().stream(), + upcomingRow.stream(), + Tuple2::of) + .filter(t -> t.f1 != null) + .collect(Collectors.toMap(t -> t.f0, t -> t.f1)); + Object[] coercedRow = new Object[currentSchema.getColumnCount()]; + + for (int i = 0; i < currentSchema.getColumnCount(); i++) { + Column currentColumn = currentColumns.get(i); + String columnName = currentColumn.getName(); + if (upcomingColumnTypes.containsKey(columnName)) { + + DataType upcomingType = upcomingColumnTypes.get(columnName); + DataType currentType = currentColumn.getType(); + + if (Objects.equals(upcomingType, currentType)) { + coercedRow[i] = upcomingColumnObjects.get(columnName); + } else { + try { + coercedRow[i] = + coerceObject( + timezone, + upcomingColumnObjects.get(columnName), + upcomingColumnTypes.get(columnName), + currentColumn.getType()); + } catch (IllegalArgumentException e) { + if (!toleranceMode) { + throw e; + } + } + } + } else { + 
coercedRow[i] = null; + } + } + return coercedRow; + } + + /** + * Try to merge given {@link Schema}s and ensure they're identical. The only difference allowed + * is nullability, string and varchar precision, default value, and comments. + * + * Rejects (via {@code Preconditions.checkArgument}) an empty list, schemas whose non-empty + * primary keys, partition keys or options differ, and schemas whose column name lists differ. + */ + public static Schema strictlyMergeSchemas(List schemas) { + Preconditions.checkArgument( + !schemas.isEmpty(), + "Trying to merge transformed schemas %s, but got empty list", + schemas); + if (schemas.size() == 1) { + return schemas.get(0); + } + + List> primaryKeys = + schemas.stream() + .map(Schema::primaryKeys) + .filter(p -> !p.isEmpty()) + .distinct() + .collect(Collectors.toList()); + List> partitionKeys = + schemas.stream() + .map(Schema::partitionKeys) + .filter(p -> !p.isEmpty()) + .distinct() + .collect(Collectors.toList()); + List> options = + schemas.stream() + .map(Schema::options) + .filter(p -> !p.isEmpty()) + .distinct() + .collect(Collectors.toList()); + List> columnNames = + schemas.stream() + .map(Schema::getColumnNames) + .distinct() + .collect(Collectors.toList()); + + Preconditions.checkArgument( + primaryKeys.size() <= 1, + "Trying to merge transformed schemas %s, but got more than one primary key configurations: %s", + schemas, + primaryKeys); + Preconditions.checkArgument( + partitionKeys.size() <= 1, + "Trying to merge transformed schemas %s, but got more than one partition key configurations: %s", + schemas, + partitionKeys); + Preconditions.checkArgument( + options.size() <= 1, + "Trying to merge transformed schemas %s, but got more than one option configurations: %s", + schemas, + options); + Preconditions.checkArgument( + columnNames.size() == 1, + "Trying to merge transformed schemas %s, but got more than one column name views: %s", + schemas, + columnNames); + + int arity = columnNames.get(0).size(); + + ArrayListMultimap toBeMergedColumnTypes = + ArrayListMultimap.create(arity, 1); + for (Schema schema : schemas) { + List columnTypes = schema.getColumnDataTypes(); + for (int colIndex = 0; colIndex <
columnTypes.size(); colIndex++) { + toBeMergedColumnTypes.put(colIndex, columnTypes.get(colIndex)); + } + } + + List mergedColumnNames = columnNames.iterator().next(); + List mergedColumnTypes = new ArrayList<>(arity); + for (int i = 0; i < arity; i++) { + mergedColumnTypes.add(strictlyMergeDataTypes(toBeMergedColumnTypes.get(i))); + } + + List mergedColumns = new ArrayList<>(); + for (int i = 0; i < mergedColumnNames.size(); i++) { + mergedColumns.add( + Column.physicalColumn(mergedColumnNames.get(i), mergedColumnTypes.get(i))); + } + + return Schema.newBuilder() + .primaryKey(primaryKeys.isEmpty() ? Collections.emptyList() : primaryKeys.get(0)) + .partitionKey( + partitionKeys.isEmpty() ? Collections.emptyList() : partitionKeys.get(0)) + .options(options.isEmpty() ? Collections.emptyMap() : options.get(0)) + .setColumns(mergedColumns) + .build(); + } + + /** + * Merges data types that must share a single type root. Identical types are returned as-is; + * otherwise the result decays to the widest variant of that root, and roots without such a + * variant are rejected with an {@link IllegalArgumentException}. + */ + private static DataType strictlyMergeDataTypes(List dataTypes) { + Preconditions.checkArgument( + !dataTypes.isEmpty(), + "Trying to merge transformed data types %s, but got empty list", + dataTypes); + + List simpleMergeTypes = + dataTypes.stream().distinct().collect(Collectors.toList()); + if (simpleMergeTypes.size() == 1) { + return simpleMergeTypes.get(0); + } + + List typeRoots = + dataTypes.stream() + .map(DataType::getTypeRoot) + .distinct() + .collect(Collectors.toList()); + Preconditions.checkArgument( + typeRoots.size() == 1, + "Trying to merge types %s, but got more than one type root: %s", + dataTypes, + typeRoots); + + // Decay types to the widest variant of the shared type root + DataType type = dataTypes.get(0); + + if (type.is(DataTypeRoot.CHAR)) { + return DataTypes.CHAR(CharType.MAX_LENGTH); + } else if (type.is(DataTypeRoot.VARCHAR)) { + return DataTypes.STRING(); + } else if (type.is(DataTypeRoot.BINARY)) { + return DataTypes.BINARY(BinaryType.MAX_LENGTH); + } else if (type.is(DataTypeRoot.VARBINARY)) { + return DataTypes.VARBINARY(VarBinaryType.MAX_LENGTH); + } else if (type.is(DataTypeRoot.TIMESTAMP_WITHOUT_TIME_ZONE)) { + return
DataTypes.TIMESTAMP(TimestampType.MAX_PRECISION); + } else if (type.is(DataTypeRoot.TIMESTAMP_WITH_TIME_ZONE)) { + return DataTypes.TIMESTAMP_TZ(ZonedTimestampType.MAX_PRECISION); + } else if (type.is(DataTypeRoot.TIMESTAMP_WITH_LOCAL_TIME_ZONE)) { + return DataTypes.TIMESTAMP_LTZ(LocalZonedTimestampType.MAX_PRECISION); + } else { + throw new IllegalArgumentException( + "Unable to merge data types with different precision: " + dataTypes); + } + } + + @VisibleForTesting + static boolean isDataTypeCompatible(@Nullable DataType currentType, DataType upcomingType) { + // If two types are identical, they're compatible of course. + if (Objects.equals(currentType, upcomingType)) { + return true; + } + + // Or, if an upcoming column does not exist in current schema, it can't be compatible. + if (currentType == null) { + return false; + } + + // Or, check if upcomingType is presented in the type merging tree. + return TYPE_MERGING_TREE.get(upcomingType.getClass()).contains(currentType); + } + + @VisibleForTesting + static DataType getLeastCommonType(DataType currentType, DataType targetType) { + // Ignore nullability during data type merge, and restore it later + boolean nullable = currentType.isNullable() || targetType.isNullable(); + currentType = currentType.notNull(); + targetType = targetType.notNull(); + + if (Objects.equals(currentType, targetType)) { + return currentType.copy(nullable); + } + + // For TIMESTAMP and EXACT_NUMERIC types, we have fine-grained type merging logic. 
+ if (currentType.is(DataTypeFamily.TIMESTAMP) && targetType.is(DataTypeFamily.TIMESTAMP)) { + return mergeTimestampType(currentType, targetType).copy(nullable); + } + + if (currentType instanceof DecimalType || targetType instanceof DecimalType) { + return mergeDecimalType(currentType, targetType).copy(nullable); + } + + List currentTypeTree = TYPE_MERGING_TREE.get(currentType.getClass()); + List targetTypeTree = TYPE_MERGING_TREE.get(targetType.getClass()); + + for (DataType type : currentTypeTree) { + if (targetTypeTree.contains(type)) { + return type.copy(nullable); + } + } + + // The most universal type and our final resort: STRING. + return DataTypes.STRING().copy(nullable); + } + + /** + * Merges two timestamp types by taking the higher of their type levels (TIMESTAMP, then + * TIMESTAMP_LTZ, then TIMESTAMP_TZ) and the larger of their precisions. Throws {@link + * IllegalArgumentException} naming the offending operand if either one is not a timestamp type. + */ + @VisibleForTesting + static DataType mergeTimestampType(DataType lType, DataType rType) { + // TIMESTAMP (0) -> TIMESTAMP_LTZ (1) -> TIMESTAMP_TZ (2) + int leftTypeLevel; + int leftPrecision; + int rightTypeLevel; + int rightPrecision; + + if (lType instanceof TimestampType) { + leftTypeLevel = 0; + leftPrecision = ((TimestampType) lType).getPrecision(); + } else if (lType instanceof LocalZonedTimestampType) { + leftTypeLevel = 1; + leftPrecision = ((LocalZonedTimestampType) lType).getPrecision(); + } else if (lType instanceof ZonedTimestampType) { + leftTypeLevel = 2; + leftPrecision = ((ZonedTimestampType) lType).getPrecision(); + } else { + throw new IllegalArgumentException("Unknown TIMESTAMP type: " + lType); + } + + if (rType instanceof TimestampType) { + rightTypeLevel = 0; + rightPrecision = ((TimestampType) rType).getPrecision(); + } else if (rType instanceof LocalZonedTimestampType) { + rightTypeLevel = 1; + rightPrecision = ((LocalZonedTimestampType) rType).getPrecision(); + } else if (rType instanceof ZonedTimestampType) { + rightTypeLevel = 2; + rightPrecision = ((ZonedTimestampType) rType).getPrecision(); + } else { + throw new IllegalArgumentException("Unknown TIMESTAMP type: " + rType); + } + + int precision = Math.max(leftPrecision, rightPrecision); + + switch
(Math.max(leftTypeLevel, rightTypeLevel)) { + case 0: + return DataTypes.TIMESTAMP(precision); + case 1: + return DataTypes.TIMESTAMP_LTZ(precision); + case 2: + return DataTypes.TIMESTAMP_TZ(precision); + default: + throw new IllegalArgumentException("Unreachable"); + } + } + + @VisibleForTesting + static DataType mergeDecimalType(DataType lType, DataType rType) { + if (lType instanceof DecimalType && rType instanceof DecimalType) { + // Merge two decimal types + DecimalType lhsDecimal = (DecimalType) lType; + DecimalType rhsDecimal = (DecimalType) rType; + int resultIntDigits = + Math.max( + lhsDecimal.getPrecision() - lhsDecimal.getScale(), + rhsDecimal.getPrecision() - rhsDecimal.getScale()); + int resultScale = Math.max(lhsDecimal.getScale(), rhsDecimal.getScale()); + Preconditions.checkArgument( + resultIntDigits + resultScale <= DecimalType.MAX_PRECISION, + String.format( + "Failed to merge %s and %s type into DECIMAL. %d precision digits required, %d available", + lType, + rType, + resultIntDigits + resultScale, + DecimalType.MAX_PRECISION)); + return DataTypes.DECIMAL(resultIntDigits + resultScale, resultScale); + } else if (lType instanceof DecimalType && rType.is(DataTypeFamily.EXACT_NUMERIC)) { + // Merge decimal and int + return mergeExactNumericsIntoDecimal((DecimalType) lType, rType); + } else if (rType instanceof DecimalType && lType.is(DataTypeFamily.EXACT_NUMERIC)) { + // Merge decimal and int + return mergeExactNumericsIntoDecimal((DecimalType) rType, lType); + } else { + return DataTypes.STRING(); + } + } + + private static DataType mergeExactNumericsIntoDecimal( + DecimalType decimalType, DataType otherType) { + int resultPrecision = + Math.max( + decimalType.getPrecision(), + decimalType.getScale() + getNumericPrecision(otherType)); + if (resultPrecision <= DecimalType.MAX_PRECISION) { + return DataTypes.DECIMAL(resultPrecision, decimalType.getScale()); + } else { + return DataTypes.STRING(); + } + } + + @VisibleForTesting + public static int 
getNumericPrecision(DataType dataType) { + if (dataType.is(DataTypeFamily.EXACT_NUMERIC)) { + if (dataType.is(DataTypeRoot.TINYINT)) { + return 3; + } else if (dataType.is(DataTypeRoot.SMALLINT)) { + return 5; + } else if (dataType.is(DataTypeRoot.INTEGER)) { + return 10; + } else if (dataType.is(DataTypeRoot.BIGINT)) { + return 19; + } else if (dataType.is(DataTypeRoot.DECIMAL)) { + return ((DecimalType) dataType).getPrecision(); + } + } + + throw new IllegalArgumentException( + "Failed to get precision of non-exact decimal type " + dataType); + } + + @VisibleForTesting + public static Object coerceObject( + String timezone, + Object originalField, + DataType originalType, + DataType destinationType) { + if (originalField == null) { + return null; + } + + if (destinationType instanceof BooleanType) { + return Boolean.valueOf(originalField.toString()); + } + + if (destinationType instanceof TinyIntType) { + return coerceToByte(originalField); + } + + if (destinationType instanceof SmallIntType) { + return coerceToShort(originalField); + } + + if (destinationType instanceof IntType) { + return coerceToInt(originalField); + } + + if (destinationType instanceof BigIntType) { + return coerceToLong(originalField); + } + + if (destinationType instanceof DecimalType) { + DecimalType decimalType = (DecimalType) destinationType; + return coerceToDecimal( + originalField, decimalType.getPrecision(), decimalType.getScale()); + } + + if (destinationType instanceof FloatType) { + return coerceToFloat(originalField); + } + + if (destinationType instanceof DoubleType) { + return coerceToDouble(originalField); + } + + if (destinationType instanceof CharType) { + return coerceToString(originalField, originalType); + } + + if (destinationType instanceof VarCharType) { + return coerceToString(originalField, originalType); + } + + if (destinationType instanceof BinaryType) { + return coerceToBytes(originalField); + } + + if (destinationType instanceof VarBinaryType) { + return 
coerceToBytes(originalField); + } + + if (destinationType instanceof DateType) { + return coerceToDate(originalField); + } + + if (destinationType instanceof TimeType) { + return coerceToTime(originalField); + } + + if (destinationType.is(DataTypeRoot.TIMESTAMP_WITHOUT_TIME_ZONE) + && originalType.is(DataTypeRoot.TIMESTAMP_WITHOUT_TIME_ZONE)) { + // For now, TimestampData / ZonedTimestampData / LocalZonedTimestampData has no + // difference in its internal representation, so there's no need to do any precision + // conversion. + return originalField; + } + + if (destinationType.is(DataTypeRoot.TIMESTAMP_WITH_TIME_ZONE) + && originalType.is(DataTypeRoot.TIMESTAMP_WITH_TIME_ZONE)) { + return originalField; + } + + if (destinationType.is(DataTypeRoot.TIMESTAMP_WITH_LOCAL_TIME_ZONE) + && originalType.is(DataTypeRoot.TIMESTAMP_WITH_LOCAL_TIME_ZONE)) { + return originalField; + } + + if (destinationType instanceof TimestampType) { + return coerceToTimestamp(originalField, timezone); + } + + if (destinationType instanceof LocalZonedTimestampType) { + return coerceToLocalZonedTimestamp(originalField, timezone); + } + + if (destinationType instanceof ZonedTimestampType) { + return coerceToZonedTimestamp(originalField, timezone); + } + + throw new IllegalArgumentException( + String.format( + "Column type \"%s\" doesn't support type coercion to \"%s\"", + originalType, destinationType)); + } + + private static Object coerceToString(Object originalField, DataType originalType) { + if (originalField == null) { + return BinaryStringData.fromString("null"); + } + + if (originalField instanceof StringData) { + return originalField; + } + + if (originalType instanceof DateType || originalType instanceof TimeType) { + return BinaryStringData.fromString(originalField.toString()); + } + + if (originalField instanceof byte[]) { + return BinaryStringData.fromString(hexlify((byte[]) originalField)); + } + + if (originalField instanceof Variant) { + return 
BinaryStringData.fromString(((Variant) originalField).toJson()); + } + + return BinaryStringData.fromString(originalField.toString()); + } + + /** + * Coerces a non-null field into a byte array. A {@code byte[]} is returned unchanged; any other + * value is rendered with {@code toString()} and encoded as UTF-8, so the result does not depend + * on the JVM's default charset. + */ + private static Object coerceToBytes(Object originalField) { + if (originalField instanceof byte[]) { + return originalField; + } else { + return originalField.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8); + } + } + + private static byte coerceToByte(Object o) { + if (o instanceof Byte) { + return (Byte) o; + } else { + throw new IllegalArgumentException( + String.format("Cannot fit type \"%s\" into a TINYINT column. ", o.getClass())); + } + } + + private static short coerceToShort(Object o) { + if (o instanceof Byte) { + return ((Byte) o).shortValue(); + } else if (o instanceof Short) { + return (Short) o; + } else { + throw new IllegalArgumentException( + String.format( + "Cannot fit type \"%s\" into a SMALLINT column. " + + "Currently only TINYINT can be accepted by a SMALLINT column", + o.getClass())); + } + } + + private static int coerceToInt(Object o) { + if (o instanceof Byte) { + return ((Byte) o).intValue(); + } else if (o instanceof Short) { + return ((Short) o).intValue(); + } else if (o instanceof Integer) { + return (Integer) o; + } else { + throw new IllegalArgumentException( + String.format( + "Cannot fit type \"%s\" into a INT column. " + + "Currently only TINYINT / SMALLINT can be accepted by a INT column", + o.getClass())); + } + } + + private static long coerceToLong(Object o) { + if (o instanceof Byte) { + return ((Byte) o).longValue(); + } else if (o instanceof Short) { + return ((Short) o).longValue(); + } else if (o instanceof Integer) { + return ((Integer) o).longValue(); + } else if (o instanceof Long) { + return (long) o; + } else { + throw new IllegalArgumentException( + String.format( + "Cannot fit type \"%s\" into a BIGINT column. 
" + + "Currently only TINYINT / SMALLINT / INT can be accepted by a BIGINT column", + o.getClass())); + } + } + + private static DecimalData coerceToDecimal(Object o, int precision, int scale) { + BigDecimal decimalValue; + if (o instanceof Byte) { + decimalValue = BigDecimal.valueOf(((Byte) o).longValue(), 0); + } else if (o instanceof Short) { + decimalValue = BigDecimal.valueOf(((Short) o).longValue(), 0); + } else if (o instanceof Integer) { + decimalValue = BigDecimal.valueOf(((Integer) o).longValue(), 0); + } else if (o instanceof Long) { + decimalValue = BigDecimal.valueOf((Long) o, 0); + } else if (o instanceof DecimalData) { + decimalValue = ((DecimalData) o).toBigDecimal(); + } else { + throw new IllegalArgumentException( + String.format( + "Cannot fit type \"%s\" into a DECIMAL column. " + + "Currently only TINYINT / SMALLINT / INT / BIGINT / DECIMAL can be accepted by a DECIMAL column", + o.getClass())); + } + return decimalValue != null + ? DecimalData.fromBigDecimal(decimalValue, precision, scale) + : null; + } + + private static float coerceToFloat(Object o) { + if (o instanceof Byte) { + return ((Byte) o).floatValue(); + } else if (o instanceof Short) { + return ((Short) o).floatValue(); + } else if (o instanceof Integer) { + return ((Integer) o).floatValue(); + } else if (o instanceof Long) { + return ((Long) o).floatValue(); + } else if (o instanceof DecimalData) { + return ((DecimalData) o).toBigDecimal().floatValue(); + } else if (o instanceof Float) { + return (Float) o; + } else { + throw new IllegalArgumentException( + String.format( + "Cannot fit type \"%s\" into a FLOAT column. 
" + + "Currently only TINYINT / SMALLINT / INT / BIGINT / DECIMAL can be accepted by a FLOAT column", + o.getClass())); + } + } + + private static double coerceToDouble(Object o) { + if (o instanceof Byte) { + return ((Byte) o).doubleValue(); + } else if (o instanceof Short) { + return ((Short) o).doubleValue(); + } else if (o instanceof Integer) { + return ((Integer) o).doubleValue(); + } else if (o instanceof Long) { + return ((Long) o).doubleValue(); + } else if (o instanceof DecimalData) { + return ((DecimalData) o).toBigDecimal().doubleValue(); + } else if (o instanceof Float) { + return ((Float) o).doubleValue(); + } else if (o instanceof Double) { + return (Double) o; + } else { + throw new IllegalArgumentException( + String.format( + "Cannot fit type \"%s\" into a DOUBLE column. " + + "Currently only TINYINT / SMALLINT / INT / BIGINT / DECIMAL / FLOAT can be accepted by a DOUBLE column", + o.getClass())); + } + } + + private static DateData coerceToDate(Object o) { + if (o == null) { + return null; + } + if (o instanceof DateData) { + return (DateData) o; + } + if (o instanceof Number) { + return DateData.fromEpochDay(((Number) o).intValue()); + } + if (o instanceof String) { + return DateData.fromIsoLocalDateString((String) o); + } + if (o instanceof LocalDate) { + return DateData.fromLocalDate((LocalDate) o); + } + if (o instanceof LocalDateTime) { + return DateData.fromLocalDate(((LocalDateTime) o).toLocalDate()); + } + throw new IllegalArgumentException( + String.format("Cannot fit type \"%s\" into a DATE column. 
", o.getClass())); + } + + private static TimeData coerceToTime(Object o) { + if (o == null) { + return null; + } + if (o instanceof TimeData) { + return (TimeData) o; + } + if (o instanceof Number) { + return TimeData.fromNanoOfDay(((Number) o).longValue()); + } + if (o instanceof String) { + return TimeData.fromIsoLocalTimeString((String) o); + } + if (o instanceof LocalTime) { + return TimeData.fromLocalTime((LocalTime) o); + } + if (o instanceof LocalDateTime) { + return TimeData.fromLocalTime(((LocalDateTime) o).toLocalTime()); + } + throw new IllegalArgumentException( + String.format("Cannot fit type \"%s\" into a TIME column. ", o.getClass())); + } + + private static TimestampData coerceToTimestamp(Object object, String timezone) { + if (object == null) { + return null; + } + if (object instanceof Long) { + return TimestampData.fromLocalDateTime( + LocalDate.ofEpochDay((long) object).atStartOfDay()); + } else if (object instanceof LocalZonedTimestampData) { + return TimestampData.fromLocalDateTime( + LocalDateTime.ofInstant( + ((LocalZonedTimestampData) object).toInstant(), ZoneId.of(timezone))); + } else if (object instanceof ZonedTimestampData) { + return TimestampData.fromLocalDateTime( + LocalDateTime.ofInstant( + ((ZonedTimestampData) object).toInstant(), ZoneId.of(timezone))); + } else if (object instanceof TimestampData) { + return (TimestampData) object; + } else if (object instanceof DateData) { + return TimestampData.fromLocalDateTime( + ((DateData) object).toLocalDate().atStartOfDay()); + } else { + throw new IllegalArgumentException( + String.format( + "Unable to implicitly coerce object `%s` as a TIMESTAMP.", object)); + } + } + + private static LocalZonedTimestampData coerceToLocalZonedTimestamp( + Object object, String timezone) { + if (object == null) { + return null; + } + + TimestampData timestampData = coerceToTimestamp(object, timezone); + return LocalZonedTimestampData.fromEpochMillis( + timestampData.getMillisecond(), 
timestampData.getNanoOfMillisecond()); + } + + private static ZonedTimestampData coerceToZonedTimestamp(Object object, String timezone) { + if (object == null) { + return null; + } + + TimestampData timestampData = coerceToTimestamp(object, timezone); + return ZonedTimestampData.fromZonedDateTime( + ZonedDateTime.ofInstant( + timestampData.toLocalDateTime().toInstant(ZoneOffset.UTC), + ZoneId.of(timezone))); + } + + private static String hexlify(byte[] bytes) { + return BaseEncoding.base64().encode(bytes); + } + + private static final Map, List> TYPE_MERGING_TREE = + getTypeMergingTree(); + + private static Map, List> getTypeMergingTree() { + DataType stringType = DataTypes.STRING(); + DataType doubleType = DataTypes.DOUBLE(); + DataType floatType = DataTypes.FLOAT(); + DataType decimalType = + DataTypes.DECIMAL(DecimalType.MAX_PRECISION, DecimalType.DEFAULT_SCALE); + DataType bigIntType = DataTypes.BIGINT(); + DataType intType = DataTypes.INT(); + DataType smallIntType = DataTypes.SMALLINT(); + DataType tinyIntType = DataTypes.TINYINT(); + DataType timestampTzType = DataTypes.TIMESTAMP_TZ(ZonedTimestampType.MAX_PRECISION); + DataType timestampLtzType = DataTypes.TIMESTAMP_LTZ(LocalZonedTimestampType.MAX_PRECISION); + DataType timestampType = DataTypes.TIMESTAMP(TimestampType.MAX_PRECISION); + DataType dateType = DataTypes.DATE(); + + Map, List> mergingTree = new HashMap<>(); + + // Simple data types + mergingTree.put(VarCharType.class, ImmutableList.of(stringType)); + mergingTree.put(CharType.class, ImmutableList.of(stringType)); + mergingTree.put(BooleanType.class, ImmutableList.of(stringType)); + mergingTree.put(BinaryType.class, ImmutableList.of(stringType)); + mergingTree.put(VarBinaryType.class, ImmutableList.of(stringType)); + mergingTree.put(DoubleType.class, ImmutableList.of(doubleType, stringType)); + mergingTree.put(FloatType.class, ImmutableList.of(floatType, doubleType, stringType)); + mergingTree.put(DecimalType.class, ImmutableList.of(stringType)); + 
mergingTree.put( + BigIntType.class, + ImmutableList.of(bigIntType, decimalType, doubleType, stringType)); + mergingTree.put( + IntType.class, + ImmutableList.of(intType, bigIntType, decimalType, doubleType, stringType)); + mergingTree.put( + SmallIntType.class, + ImmutableList.of( + smallIntType, + intType, + bigIntType, + decimalType, + floatType, + doubleType, + stringType)); + mergingTree.put( + TinyIntType.class, + ImmutableList.of( + tinyIntType, + smallIntType, + intType, + bigIntType, + decimalType, + floatType, + doubleType, + stringType)); + + // Timestamp series + mergingTree.put(ZonedTimestampType.class, ImmutableList.of(timestampTzType, stringType)); + mergingTree.put( + LocalZonedTimestampType.class, + ImmutableList.of(timestampLtzType, timestampTzType, stringType)); + mergingTree.put( + TimestampType.class, + ImmutableList.of(timestampType, timestampLtzType, timestampTzType, stringType)); + mergingTree.put( + DateType.class, + ImmutableList.of( + dateType, timestampType, timestampLtzType, timestampTzType, stringType)); + mergingTree.put(TimeType.class, ImmutableList.of(stringType)); + + // Complex types + mergingTree.put(RowType.class, ImmutableList.of(stringType)); + mergingTree.put(ArrayType.class, ImmutableList.of(stringType)); + mergingTree.put(MapType.class, ImmutableList.of(stringType)); + mergingTree.put(VariantType.class, ImmutableList.of(stringType)); + return mergingTree; + } +} diff --git a/flink-cdc-common-2.x/src/test/java/org/apache/flink/cdc/common/factories/FactoryHelperTests.java b/flink-cdc-common-2.x/src/test/java/org/apache/flink/cdc/common/factories/FactoryHelperTests.java new file mode 100644 index 00000000000..3c2c82af1cd --- /dev/null +++ b/flink-cdc-common-2.x/src/test/java/org/apache/flink/cdc/common/factories/FactoryHelperTests.java @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.factories; + +import org.apache.flink.cdc.common.configuration.ConfigOption; +import org.apache.flink.cdc.common.configuration.ConfigOptions; +import org.apache.flink.cdc.common.configuration.Configuration; +import org.apache.flink.table.api.ValidationException; + +import org.apache.flink.shaded.guava33.com.google.common.collect.Sets; + +import org.assertj.core.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +/** Tests for {@link FactoryHelper}. 
*/ +class FactoryHelperTests { + + private Factory getDummyFactory() { + + return new Factory() { + @Override + public String identifier() { + return "dummy"; + } + + @Override + public Set> requiredOptions() { + return Sets.newHashSet( + ConfigOptions.key("id") + .intType() + .noDefaultValue() + .withFallbackKeys("id_fallback"), + ConfigOptions.key("name").stringType().noDefaultValue(), + ConfigOptions.key("age").doubleType().noDefaultValue()); + } + + @Override + public Set> optionalOptions() { + return Sets.newHashSet( + ConfigOptions.key("hobby") + .stringType() + .noDefaultValue() + .withFallbackKeys("hobby_fallback"), + ConfigOptions.key("location").stringType().defaultValue("Everywhere"), + ConfigOptions.key("misc") + .mapType() + .defaultValue(Collections.singletonMap("A", "Z"))); + } + }; + } + + @Test + void testCorrectConfigValidation() { + // This is a valid configuration. + Map configurations = new HashMap<>(); + configurations.put("id", "1"); + configurations.put("name", "Alice"); + configurations.put("age", "17"); + configurations.put("location", "Here"); + + FactoryHelper factoryHelper = + FactoryHelper.createFactoryHelper( + getDummyFactory(), + new FactoryHelper.DefaultContext( + Configuration.fromMap(configurations), null, null)); + + factoryHelper.validate(); + + // Validation for fallback keys. + configurations.clear(); + configurations.put("id_fallback", "2"); + configurations.put("name", "Bob"); + configurations.put("age", "18"); + configurations.put("hobby_fallback", "Swimming"); + factoryHelper = + FactoryHelper.createFactoryHelper( + getDummyFactory(), + new FactoryHelper.DefaultContext( + Configuration.fromMap(configurations), null, null)); + factoryHelper.validate(); + } + + @Test + void testMissingRequiredOptionConfigValidation() { + // This configuration doesn't provide all required options. 
+ Map configurations = new HashMap<>(); + configurations.put("id", "1"); + configurations.put("age", "17"); + configurations.put("location", "Here"); + + FactoryHelper factoryHelper = + FactoryHelper.createFactoryHelper( + getDummyFactory(), + new FactoryHelper.DefaultContext( + Configuration.fromMap(configurations), null, null)); + + Assertions.assertThatThrownBy(factoryHelper::validate) + .isExactlyInstanceOf(ValidationException.class) + .hasMessageContaining("One or more required options are missing."); + } + + @Test + void testIncompatibleTypeValidation() { + // This configuration has an option with mismatched type. + Map configurations = new HashMap<>(); + configurations.put("id", "1"); + configurations.put("name", "Alice"); + configurations.put("age", "Not a number"); + configurations.put("location", "Here"); + + FactoryHelper factoryHelper = + FactoryHelper.createFactoryHelper( + getDummyFactory(), + new FactoryHelper.DefaultContext( + Configuration.fromMap(configurations), null, null)); + + Assertions.assertThatThrownBy(factoryHelper::validate) + .isExactlyInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Could not parse value 'Not a number' for key 'age'."); + } + + @Test + void testRedundantConfigValidation() { + // This configuration has redundant config options. + Map configurations = new HashMap<>(); + configurations.put("id", "1"); + configurations.put("name", "Alice"); + configurations.put("age", "17"); + configurations.put("what", "Not a valid configOption"); + + FactoryHelper factoryHelper = + FactoryHelper.createFactoryHelper( + getDummyFactory(), + new FactoryHelper.DefaultContext( + Configuration.fromMap(configurations), null, null)); + + Assertions.assertThatThrownBy(factoryHelper::validate) + .isExactlyInstanceOf(ValidationException.class) + .hasMessageContaining("Unsupported options found for 'dummy'."); + } + + @Test + void testAllowedPrefixConfigValidation() { + // This configuration has allowed prefix options. 
+ Map configurations = new HashMap<>(); + configurations.put("id", "1"); + configurations.put("name", "Alice"); + configurations.put("age", "17"); + configurations.put("debezium.foo", "Some debezium options"); + configurations.put("debezium.bar", "Another debezium options"); + configurations.put("canal.baz", "Yet another debezium options"); + + FactoryHelper factoryHelper = + FactoryHelper.createFactoryHelper( + getDummyFactory(), + new FactoryHelper.DefaultContext( + Configuration.fromMap(configurations), null, null)); + + Assertions.assertThatThrownBy(factoryHelper::validate) + .isExactlyInstanceOf(ValidationException.class) + .hasMessageContaining("Unsupported options found for 'dummy'."); + + Assertions.assertThatThrownBy(() -> factoryHelper.validateExcept("debezium.")) + .isExactlyInstanceOf(ValidationException.class) + .hasMessageContaining("Unsupported options found for 'dummy'."); + + Assertions.assertThatThrownBy(() -> factoryHelper.validateExcept("canal.")) + .isExactlyInstanceOf(ValidationException.class) + .hasMessageContaining("Unsupported options found for 'dummy'."); + + factoryHelper.validateExcept("debezium.", "canal."); + } +} diff --git a/flink-cdc-common-2.x/src/test/java/org/apache/flink/cdc/common/schema/SelectorsTest.java b/flink-cdc-common-2.x/src/test/java/org/apache/flink/cdc/common/schema/SelectorsTest.java new file mode 100644 index 00000000000..c2ada3b191b --- /dev/null +++ b/flink-cdc-common-2.x/src/test/java/org/apache/flink/cdc/common/schema/SelectorsTest.java @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.schema; + +import org.apache.flink.cdc.common.event.TableId; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Test for {@link org.apache.flink.cdc.common.schema.Selectors}. */ +class SelectorsTest { + + @Test + void testTableSelector() { + + // nameSpace, schemaName, tableName + Selectors selectors = + new Selectors.SelectorsBuilder() + .includeTables("db.sc1.A[0-9]+,db.sc2.B[0-1]+,db.sc1.sc1") + .build(); + + assertAllowed(selectors, "db", "sc1", "sc1"); + assertAllowed(selectors, "db", "sc1", "A1"); + assertAllowed(selectors, "db", "sc1", "A2"); + assertAllowed(selectors, "db", "sc2", "B0"); + assertAllowed(selectors, "db", "sc2", "B1"); + assertNotAllowed(selectors, "db", "sc1", "A"); + assertNotAllowed(selectors, "db", "sc1a", "B"); + assertNotAllowed(selectors, "db", "sc1", "AA"); + assertNotAllowed(selectors, "db", "sc2", "B2"); + assertNotAllowed(selectors, "db2", "sc1", "A1"); + assertNotAllowed(selectors, "db2", "sc1", "A2"); + assertNotAllowed(selectors, "db", "sc11", "A1"); + assertNotAllowed(selectors, "db", "sc1A", "A1"); + + selectors = + new Selectors.SelectorsBuilder() + .includeTables("db\\..sc1.A[0-9]+,db.sc2.B[0-1]+,db\\..sc1.sc1,db.sc1.sc1") + .build(); + + assertAllowed(selectors, "db", "sc1", "sc1"); + assertAllowed(selectors, "db1", "sc1", "sc1"); + assertAllowed(selectors, "dba", "sc1", "sc1"); + assertAllowed(selectors, "db1", "sc1", "A1"); + assertAllowed(selectors, "dba", "sc1", "A2"); + assertAllowed(selectors, "db", "sc2", 
"B0"); + assertAllowed(selectors, "db", "sc2", "B1"); + assertNotAllowed(selectors, "db", "sc1", "A"); + assertNotAllowed(selectors, "db", "sc1a", "B"); + assertNotAllowed(selectors, "db", "sc1", "AA"); + assertNotAllowed(selectors, "db", "sc2", "B2"); + assertNotAllowed(selectors, "dba1", "sc1", "A1"); + assertNotAllowed(selectors, "dba2", "sc1", "A2"); + assertNotAllowed(selectors, "db", "sc11", "A1"); + assertNotAllowed(selectors, "db", "sc1A", "A1"); + + // schemaName, tableName + selectors = + new Selectors.SelectorsBuilder() + .includeTables("sc1.A[0-9]+,sc2.B[0-1]+,sc1.sc1") + .build(); + + assertAllowed(selectors, null, "sc1", "sc1"); + assertAllowed(selectors, null, "sc1", "A1"); + assertAllowed(selectors, null, "sc1", "A2"); + assertAllowed(selectors, null, "sc2", "B0"); + assertAllowed(selectors, null, "sc2", "B1"); + assertNotAllowed(selectors, "db", "sc1", "A1"); + assertNotAllowed(selectors, null, "sc1", "A"); + assertNotAllowed(selectors, null, "sc2", "B"); + assertNotAllowed(selectors, null, "sc1", "AA"); + assertNotAllowed(selectors, null, "sc11", "A1"); + assertNotAllowed(selectors, null, "sc1A", "A1"); + + // tableName + selectors = + new Selectors.SelectorsBuilder().includeTables("\\.A[0-9]+,B[0-1]+,sc1").build(); + + assertAllowed(selectors, null, null, "sc1"); + assertNotAllowed(selectors, "db", "sc1", "sc1"); + assertNotAllowed(selectors, null, "sc1", "sc1"); + assertAllowed(selectors, null, null, "1A1"); + assertAllowed(selectors, null, null, "AA2"); + assertAllowed(selectors, null, null, "B0"); + assertAllowed(selectors, null, null, "B1"); + assertNotAllowed(selectors, "db", "sc1", "A1"); + assertNotAllowed(selectors, null, null, "A"); + assertNotAllowed(selectors, null, null, "B"); + assertNotAllowed(selectors, null, null, "2B"); + + selectors = + new Selectors.SelectorsBuilder() + .includeTables("sc1.A[0-9]+,sc2.B[0-1]+,sc1.sc1") + .build(); + + assertAllowed(selectors, null, "sc1", "sc1"); + assertAllowed(selectors, null, "sc1", "A1"); + 
assertAllowed(selectors, null, "sc1", "A2"); + assertAllowed(selectors, null, "sc1", "A2"); + assertAllowed(selectors, null, "sc2", "B0"); + assertNotAllowed(selectors, "db", "sc1", "A1"); + assertNotAllowed(selectors, null, "sc1", "A"); + assertNotAllowed(selectors, null, "sc1", "AA"); + assertNotAllowed(selectors, null, "sc2", "B"); + assertNotAllowed(selectors, null, "sc2", "B2"); + assertNotAllowed(selectors, null, "sc11", "A1"); + assertNotAllowed(selectors, null, "sc1A", "A1"); + + selectors = new Selectors.SelectorsBuilder().includeTables("sc1.sc1").build(); + assertAllowed(selectors, null, "sc1", "sc1"); + + selectors = new Selectors.SelectorsBuilder().includeTables("sc1.sc[0-9]+").build(); + assertAllowed(selectors, null, "sc1", "sc1"); + + selectors = new Selectors.SelectorsBuilder().includeTables("sc1.\\.*").build(); + assertAllowed(selectors, null, "sc1", "sc1"); + } + + protected void assertAllowed( + Selectors filter, String nameSpace, String schemaName, String tableName) { + + TableId id = getTableId(nameSpace, schemaName, tableName); + + assertThat(filter.isMatch(id)).isTrue(); + } + + protected void assertNotAllowed( + Selectors filter, String nameSpace, String schemaName, String tableName) { + + TableId id = getTableId(nameSpace, schemaName, tableName); + + assertThat(filter.isMatch(id)).isFalse(); + } + + private static TableId getTableId(String nameSpace, String schemaName, String tableName) { + TableId id; + if (nameSpace == null && schemaName == null) { + id = TableId.tableId(tableName); + } else if (nameSpace == null) { + id = TableId.tableId(schemaName, tableName); + } else { + id = TableId.tableId(nameSpace, schemaName, tableName); + } + return id; + } +} diff --git a/flink-cdc-common-2.x/src/test/java/org/apache/flink/cdc/common/types/variant/BinaryVariantInternalBuilderTest.java b/flink-cdc-common-2.x/src/test/java/org/apache/flink/cdc/common/types/variant/BinaryVariantInternalBuilderTest.java new file mode 100644 index 
00000000000..b3667d0e50d --- /dev/null +++ b/flink-cdc-common-2.x/src/test/java/org/apache/flink/cdc/common/types/variant/BinaryVariantInternalBuilderTest.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.types.variant; + +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.math.BigDecimal; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +class BinaryVariantInternalBuilderTest { + + @Test + void testParseScalarJson() throws IOException { + assertThat(BinaryVariantInternalBuilder.parseJson("1", false).getByte()) + .isEqualTo((byte) 1); + short s = (short) (Byte.MAX_VALUE + 1L); + assertThat(BinaryVariantInternalBuilder.parseJson(String.valueOf(s), false).getShort()) + .isEqualTo(s); + int i = (int) (Short.MAX_VALUE + 1L); + assertThat(BinaryVariantInternalBuilder.parseJson(String.valueOf(i), false).getInt()) + .isEqualTo(i); + long l = Integer.MAX_VALUE + 1L; + assertThat(BinaryVariantInternalBuilder.parseJson(String.valueOf(l), false).getLong()) + .isEqualTo(l); + + BigDecimal bigDecimal = BigDecimal.valueOf(Long.MAX_VALUE).add(BigDecimal.ONE); + 
assertThat( + BinaryVariantInternalBuilder.parseJson(bigDecimal.toPlainString(), false) + .getDecimal()) + .isEqualTo(bigDecimal); + + assertThat(BinaryVariantInternalBuilder.parseJson("1.123", false).getDecimal()) + .isEqualTo(BigDecimal.valueOf(1.123)); + assertThat( + BinaryVariantInternalBuilder.parseJson( + String.valueOf(Double.MAX_VALUE), false) + .getDouble()) + .isEqualTo(Double.MAX_VALUE); + + assertThat(BinaryVariantInternalBuilder.parseJson("\"hello\"", false).getString()) + .isEqualTo("hello"); + + assertThat(BinaryVariantInternalBuilder.parseJson("true", false).getBoolean()).isTrue(); + + assertThat(BinaryVariantInternalBuilder.parseJson("false", false).getBoolean()).isFalse(); + + assertThat(BinaryVariantInternalBuilder.parseJson("null", false).isNull()).isTrue(); + } + + @Test + void testParseJsonArray() throws IOException { + BinaryVariant variant = BinaryVariantInternalBuilder.parseJson("[]", false); + assertThat(variant.getElement(0)).isNull(); + + variant = BinaryVariantInternalBuilder.parseJson("[1,\"hello\",3.1, null]", false); + assertThat(variant.getElement(0).getByte()).isEqualTo((byte) 1); + assertThat(variant.getElement(1).getString()).isEqualTo("hello"); + assertThat(variant.getElement(2).getDecimal()).isEqualTo(BigDecimal.valueOf(3.1)); + assertThat(variant.getElement(3).isNull()).isTrue(); + + variant = BinaryVariantInternalBuilder.parseJson("[1,[\"hello\",[3.1]]]", false); + assertThat(variant.getElement(0).getByte()).isEqualTo((byte) 1); + assertThat(variant.getElement(1).getElement(0).getString()).isEqualTo("hello"); + assertThat(variant.getElement(1).getElement(1).getElement(0).getDecimal()) + .isEqualTo(BigDecimal.valueOf(3.1)); + } + + @Test + void testParseJsonObject() throws IOException { + BinaryVariant variant = BinaryVariantInternalBuilder.parseJson("{}", false); + assertThat(variant.getField("a")).isNull(); + + variant = + BinaryVariantInternalBuilder.parseJson( + "{\"a\":1,\"b\":\"hello\",\"c\":3.1}", false); + + 
assertThat(variant.getField("a").getByte()).isEqualTo((byte) 1); + assertThat(variant.getField("b").getString()).isEqualTo("hello"); + assertThat(variant.getField("c").getDecimal()).isEqualTo(BigDecimal.valueOf(3.1)); + + variant = + BinaryVariantInternalBuilder.parseJson( + "{\"a\":1,\"b\":{\"c\":\"hello\",\"d\":[3.1]}}", false); + assertThat(variant.getField("a").getByte()).isEqualTo((byte) 1); + assertThat(variant.getField("b").getField("c").getString()).isEqualTo("hello"); + assertThat(variant.getField("b").getField("d").getElement(0).getDecimal()) + .isEqualTo(BigDecimal.valueOf(3.1)); + + assertThatThrownBy( + () -> + BinaryVariantInternalBuilder.parseJson( + "{\"k1\":1,\"k1\":2,\"k2\":1.5}", false)) + .isInstanceOf(VariantTypeException.class) + .hasMessage("VARIANT_DUPLICATE_KEY"); + + variant = BinaryVariantInternalBuilder.parseJson("{\"k1\":1,\"k1\":2,\"k2\":1.5}", true); + assertThat(variant.getField("k1").getByte()).isEqualTo((byte) 2); + assertThat(variant.getField("k2").getDecimal()).isEqualTo(BigDecimal.valueOf(1.5)); + } +} diff --git a/flink-cdc-common-2.x/src/test/java/org/apache/flink/cdc/common/types/variant/BinaryVariantTest.java b/flink-cdc-common-2.x/src/test/java/org/apache/flink/cdc/common/types/variant/BinaryVariantTest.java new file mode 100644 index 00000000000..41726d14a0c --- /dev/null +++ b/flink-cdc-common-2.x/src/test/java/org/apache/flink/cdc/common/types/variant/BinaryVariantTest.java @@ -0,0 +1,265 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.types.variant; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.math.BigDecimal; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.temporal.ChronoUnit; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +class BinaryVariantTest { + + private BinaryVariantBuilder builder; + + @BeforeEach + void setUp() { + builder = new BinaryVariantBuilder(); + } + + @Test + void testScalarVariant() { + + assertThat(builder.of((byte) 10).isPrimitive()).isTrue(); + assertThat(builder.of((byte) 10).isNull()).isFalse(); + assertThat(builder.of((byte) 10).isArray()).isFalse(); + assertThat(builder.of((byte) 10).isObject()).isFalse(); + assertThat(builder.of((byte) 10).getType()).isEqualTo(Variant.Type.TINYINT); + + assertThat(builder.of((byte) 10).getByte()).isEqualTo((byte) 10); + assertThat(builder.of((byte) 10).get()).isEqualTo((byte) 10); + assertThat((byte) builder.of((byte) 10).getAs()).isEqualTo((byte) 10); + + assertThat(builder.of((short) 10).getShort()).isEqualTo((short) 10); + assertThat(builder.of((short) 10).get()).isEqualTo((short) 10); + + assertThat(builder.of(10).getInt()).isEqualTo(10); + assertThat(builder.of(10).get()).isEqualTo(10); + + assertThat(builder.of(10L).getLong()).isEqualTo(10L); + assertThat(builder.of(10L).get()).isEqualTo(10L); + + assertThat(builder.of(10.0).getDouble()).isEqualTo(10.0d); + 
assertThat(builder.of(10.0).get()).isEqualTo(10.0d); + + assertThat(builder.of(10.0f).getFloat()).isEqualTo(10.0f); + assertThat(builder.of(10.0f).get()).isEqualTo(10.0f); + + assertThat(builder.of("hello").getString()).isEqualTo("hello"); + assertThat(builder.of("hello").get()).isEqualTo("hello"); + + assertThat(builder.of("hello".getBytes()).getBytes()).isEqualTo("hello".getBytes()); + assertThat(builder.of("hello".getBytes()).get()).isEqualTo("hello".getBytes()); + + assertThat(builder.of(true).getBoolean()).isTrue(); + assertThat(builder.of(true).get()).isEqualTo(true); + + assertThat(builder.of(BigDecimal.valueOf(100)).getDecimal()) + .isEqualByComparingTo(BigDecimal.valueOf(100)); + assertThat((BigDecimal) builder.of(BigDecimal.valueOf(100)).get()) + .isEqualByComparingTo(BigDecimal.valueOf(100)); + + Instant instant = Instant.now().truncatedTo(ChronoUnit.MICROS); + assertThat(builder.of(instant).getInstant()).isEqualTo(instant); + assertThat(builder.of(instant).get()).isEqualTo(instant); + + LocalDateTime localDateTime = LocalDateTime.now().truncatedTo(ChronoUnit.MICROS); + assertThat(builder.of(localDateTime).getDateTime()).isEqualTo(localDateTime); + assertThat(builder.of(localDateTime).get()).isEqualTo(localDateTime); + + LocalDate localDate = LocalDate.now(); + assertThat(builder.of(localDate).getDate()).isEqualTo(localDate); + assertThat(builder.of(localDate).get()).isEqualTo(localDate); + + assertThat(builder.ofNull().get()).isEqualTo(null); + assertThat(builder.ofNull().isNull()).isTrue(); + } + + @Test + void testArrayVariant() { + Instant now = Instant.now().truncatedTo(ChronoUnit.MICROS); + Variant variant = + builder.array() + .add(builder.of(1)) + .add(builder.of("hello")) + .add(builder.of(now)) + .add(builder.array().add(builder.of("hello2")).add(builder.of(10f)).build()) + .add(builder.ofNull()) + .build(); + + assertThat(variant.isArray()).isTrue(); + assertThat(variant.isPrimitive()).isFalse(); + assertThat(variant.isObject()).isFalse(); + 
assertThat(variant.getType()).isEqualTo(Variant.Type.ARRAY); + + assertThat(variant.getElement(-1)).isNull(); + assertThat(variant.getElement(0).getInt()).isEqualTo(1); + assertThat(variant.getElement(1).getString()).isEqualTo("hello"); + assertThat(variant.getElement(2).getInstant()).isEqualTo(now); + assertThat(variant.getElement(3).getElement(0).getString()).isEqualTo("hello2"); + assertThat(variant.getElement(3).getElement(1).getFloat()).isEqualTo(10f); + assertThat(variant.getElement(4).isNull()).isTrue(); + assertThat(variant.getElement(5)).isNull(); + } + + @Test + void testObjectVariant() { + Variant variant = + builder.object() + .add( + "list", + builder.array().add(builder.of("hello")).add(builder.of(1)).build()) + .add( + "object", + builder.object() + .add("ss", builder.of((short) 1)) + .add("ff", builder.of(10.0f)) + .build()) + .add("bb", builder.of((byte) 10)) + .build(); + + assertThat(variant.isArray()).isFalse(); + assertThat(variant.isPrimitive()).isFalse(); + assertThat(variant.isObject()).isTrue(); + assertThat(variant.getType()).isEqualTo(Variant.Type.OBJECT); + + assertThat(variant.getField("list").isArray()).isTrue(); + assertThat(variant.getField("list").getElement(0).getString()).isEqualTo("hello"); + assertThat(variant.getField("list").getElement(1).getInt()).isEqualTo(1); + + assertThat(variant.getField("object").isObject()).isTrue(); + assertThat(variant.getField("object").getField("ss").getShort()).isEqualTo((short) 1); + assertThat(variant.getField("object").getField("ff").getFloat()).isEqualTo((10.0f)); + + assertThat(variant.getField("bb").getByte()).isEqualTo((byte) 10); + assertThat(variant.getField("non_exist")).isNull(); + + BinaryVariantBuilder.VariantObjectBuilder objectBuilder = builder.object(); + + for (int i = 0; i < 100; i++) { + objectBuilder.add(String.valueOf(i), builder.of(i)); + } + variant = objectBuilder.build(); + for (int i = 0; i < 100; i++) { + 
assertThat(variant.getField(String.valueOf(i)).getInt()).isEqualTo(i); + } + } + + @Test + void testDuplicatedKeyObjectVariant() { + assertThatThrownBy( + () -> + builder.object(false) + .add("k", builder.of((byte) 10)) + .add("k", builder.of("hello")) + .build()) + .isInstanceOf(RuntimeException.class) + .hasMessage("VARIANT_DUPLICATE_KEY"); + + Variant variant = + builder.object(true) + .add("k", builder.of((byte) 10)) + .add("k", builder.of("hello")) + .add("k1", builder.of(10)) + .build(); + + assertThat(variant.getField("k").getString()).isEqualTo("hello"); + assertThat(variant.getField("k1").getInt()).isEqualTo(10); + } + + @Test + void testToJsonScalar() { + Instant instant = Instant.EPOCH; + LocalDateTime localDateTime = LocalDateTime.of(2000, 1, 1, 0, 0); + LocalDate localDate = LocalDate.of(2000, 1, 1); + + assertThat(builder.of((byte) 1).toJson()).isEqualTo("1"); + assertThat(builder.of((short) 1).toJson()).isEqualTo("1"); + assertThat(builder.of(1L).toJson()).isEqualTo("1"); + assertThat(builder.of(1).toJson()).isEqualTo("1"); + assertThat(builder.of("hello").toJson()).isEqualTo("\"hello\""); + assertThat(builder.of(true).toJson()).isEqualTo("true"); + assertThat(builder.of(10.0f).toJson()).isEqualTo("10.0"); + assertThat(builder.of(10.0d).toJson()).isEqualTo("10.0"); + assertThat(builder.of(BigDecimal.valueOf(100)).toJson()).isEqualTo("100"); + assertThat(builder.of(instant).toJson()).isEqualTo("\"1970-01-01T00:00:00+00:00\""); + assertThat(builder.of(localDateTime).toJson()).isEqualTo("\"2000-01-01T00:00:00\""); + assertThat(builder.of(localDate).toJson()).isEqualTo("\"2000-01-01\""); + assertThat(builder.of("hello".getBytes()).toJson()).isEqualTo("\"aGVsbG8=\""); + assertThat(builder.ofNull().toJson()).isEqualTo("null"); + } + + @Test + void testToJsonNested() { + Variant variant = + builder.object() + .add( + "list", + builder.array().add(builder.of("hello")).add(builder.of(1)).build()) + .add( + "object", + builder.object() + .add("ss", 
builder.of((short) 1)) + .add("ff", builder.of(10.0f)) + .build()) + .build(); + + String json = variant.toJson(); + assertThat(json) + .isEqualTo("{" + "\"list\":[\"hello\",1]," + "\"object\":{\"ff\":10.0,\"ss\":1}}"); + } + + @Test + void testVariantException() { + assertThatThrownBy(() -> new BinaryVariant(new byte[0], new byte[0])) + .isInstanceOf(RuntimeException.class) + .hasMessage("MALFORMED_VARIANT"); + + byte[] meta = new byte[1]; + meta[0] = (byte) 0x02; + assertThatThrownBy(() -> new BinaryVariant(new byte[1], meta)) + .isInstanceOf(RuntimeException.class) + .hasMessage("MALFORMED_VARIANT"); + + byte[] oversize = new byte[0xFFFFFF + 2]; + meta[0] = (byte) 0x01; + oversize[0] = (byte) 0x01; + assertThatThrownBy(() -> new BinaryVariant(oversize, meta)) + .isInstanceOf(RuntimeException.class) + .hasMessage("VARIANT_CONSTRUCTOR_SIZE_LIMIT"); + + assertThatThrownBy(() -> new BinaryVariant(new byte[1], oversize)) + .isInstanceOf(RuntimeException.class) + .hasMessage("VARIANT_CONSTRUCTOR_SIZE_LIMIT"); + } + + @Test + void testGetThrowException() { + Variant variant = builder.of(10f); + assertThatThrownBy(variant::getDouble) + .isInstanceOf(VariantTypeException.class) + .hasMessage("Expected type DOUBLE but got FLOAT"); + } +} diff --git a/flink-cdc-common-2.x/src/test/java/org/apache/flink/cdc/common/utils/SchemaMergingUtilsTest.java b/flink-cdc-common-2.x/src/test/java/org/apache/flink/cdc/common/utils/SchemaMergingUtilsTest.java new file mode 100644 index 00000000000..791c5fe7ea0 --- /dev/null +++ b/flink-cdc-common-2.x/src/test/java/org/apache/flink/cdc/common/utils/SchemaMergingUtilsTest.java @@ -0,0 +1,1228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.common.utils; + +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.tuple.Tuple4; +import org.apache.flink.cdc.common.data.DateData; +import org.apache.flink.cdc.common.data.DecimalData; +import org.apache.flink.cdc.common.data.LocalZonedTimestampData; +import org.apache.flink.cdc.common.data.TimeData; +import org.apache.flink.cdc.common.data.TimestampData; +import org.apache.flink.cdc.common.data.ZonedTimestampData; +import org.apache.flink.cdc.common.data.binary.BinaryStringData; +import org.apache.flink.cdc.common.event.AddColumnEvent; +import org.apache.flink.cdc.common.event.AlterColumnTypeEvent; +import org.apache.flink.cdc.common.event.CreateTableEvent; +import org.apache.flink.cdc.common.event.DropColumnEvent; +import org.apache.flink.cdc.common.event.SchemaChangeEvent; +import org.apache.flink.cdc.common.event.TableId; +import org.apache.flink.cdc.common.schema.Column; +import org.apache.flink.cdc.common.schema.Schema; +import org.apache.flink.cdc.common.types.DataType; +import org.apache.flink.cdc.common.types.DataTypes; +import org.apache.flink.cdc.common.types.DecimalType; +import org.apache.flink.cdc.common.types.LocalZonedTimestampType; +import org.apache.flink.cdc.common.types.TimestampType; +import org.apache.flink.cdc.common.types.ZonedTimestampType; + +import 
org.apache.flink.shaded.guava33.com.google.common.collect.ImmutableMap; + +import org.assertj.core.api.Assertions; +import org.junit.jupiter.api.Test; + +import javax.annotation.Nullable; + +import java.math.BigDecimal; +import java.sql.Timestamp; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalTime; +import java.time.ZoneId; +import java.time.ZonedDateTime; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Stream; + +import static org.apache.flink.cdc.common.types.DataTypes.DECIMAL; +import static org.apache.flink.cdc.common.types.DataTypes.VARCHAR; +import static org.apache.flink.cdc.common.utils.SchemaMergingUtils.coerceObject; +import static org.apache.flink.cdc.common.utils.SchemaMergingUtils.coerceRow; +import static org.apache.flink.cdc.common.utils.SchemaMergingUtils.getLeastCommonSchema; +import static org.apache.flink.cdc.common.utils.SchemaMergingUtils.getLeastCommonType; +import static org.apache.flink.cdc.common.utils.SchemaMergingUtils.getSchemaDifference; +import static org.apache.flink.cdc.common.utils.SchemaMergingUtils.isDataTypeCompatible; +import static org.apache.flink.cdc.common.utils.SchemaMergingUtils.isSchemaCompatible; + +/** A test for the {@link SchemaMergingUtils}. 
*/ +class SchemaMergingUtilsTest { + + private static final TableId TABLE_ID = TableId.tableId("foo", "bar", "baz"); + + private static final DataType CHAR = DataTypes.CHAR(17); + private static final DataType VARCHAR = DataTypes.VARCHAR(17); + private static final DataType STRING = DataTypes.STRING(); + + private static final DataType BOOLEAN = DataTypes.BOOLEAN(); + private static final DataType BINARY = DataTypes.BINARY(17); + private static final DataType VARBINARY = DataTypes.VARBINARY(17); + private static final DataType SMALLINT = DataTypes.SMALLINT(); + private static final DataType TINYINT = DataTypes.TINYINT(); + private static final DataType INT = DataTypes.INT(); + private static final DataType BIGINT = DataTypes.BIGINT(); + private static final DataType DECIMAL = + DECIMAL(DecimalType.MAX_PRECISION, DecimalType.DEFAULT_SCALE); + private static final DataType FLOAT = DataTypes.FLOAT(); + private static final DataType DOUBLE = DataTypes.DOUBLE(); + + private static final DataType TIMESTAMP_TZ = + DataTypes.TIMESTAMP_TZ(ZonedTimestampType.MAX_PRECISION); + private static final DataType TIMESTAMP_LTZ = + DataTypes.TIMESTAMP_LTZ(LocalZonedTimestampType.MAX_PRECISION); + private static final DataType TIMESTAMP = DataTypes.TIMESTAMP(TimestampType.MAX_PRECISION); + private static final DataType DATE = DataTypes.DATE(); + private static final DataType TIME = DataTypes.TIME(); + + private static final DataType ROW = DataTypes.ROW(INT, STRING); + private static final DataType ARRAY = DataTypes.ARRAY(STRING); + private static final DataType MAP = DataTypes.MAP(INT, STRING); + private static final DataType VARIANT = DataTypes.VARIANT(); + + private static final List ALL_TYPES = + Arrays.asList( + // Binary types + STRING, + CHAR, + VARCHAR, + BINARY, + VARBINARY, + // Exact numeric types + TINYINT, + SMALLINT, + INT, + BIGINT, + DECIMAL, + // Inexact numeric types + FLOAT, + DOUBLE, + // Date and time types + TIMESTAMP, + TIMESTAMP_LTZ, + TIMESTAMP_TZ, + TIME, + // 
Complex types + ROW, + ARRAY, + MAP, + VARIANT); + + private static final Map DUMMY_OBJECTS = + ImmutableMap.of( + TINYINT, + (byte) 17, + SMALLINT, + (short) 17, + INT, + 17, + BIGINT, + 17L, + DECIMAL, + decOf(17), + FLOAT, + 17.0f, + DOUBLE, + 17.0); + + @Test + void testIsSchemaCompatible() { + Assertions.assertThat(isSchemaCompatible(null, of("id", BIGINT, "name", VARCHAR(17)))) + .as("test merging into an empty schema") + .isFalse(); + + Assertions.assertThat( + isSchemaCompatible( + of("id", BIGINT, "name", VARCHAR(17)), + of("id", BIGINT, "name", VARCHAR(17)))) + .as("test identical schema") + .isTrue(); + + Assertions.assertThat( + isSchemaCompatible( + of("id", BIGINT, "name", VARCHAR(17)), + of("name", VARCHAR(17), "id", BIGINT))) + .as("swapping sequence is ok") + .isTrue(); + + Assertions.assertThat( + isSchemaCompatible(of("id", BIGINT, "name", VARCHAR(17)), of("id", BIGINT))) + .as("test a wider upcoming schema") + .isTrue(); + + Assertions.assertThat( + isSchemaCompatible(of("id", BIGINT), of("id", BIGINT, "name", VARCHAR(17)))) + .as("test a narrower upcoming schema") + .isFalse(); + + Assertions.assertThat( + isSchemaCompatible( + of("id", BIGINT, "name", STRING), + of("id", BIGINT, "name", VARCHAR(17)))) + .as("test a wider typed upcoming schema") + .isTrue(); + + Assertions.assertThat( + isSchemaCompatible( + of("id", BIGINT, "name", VARCHAR(17)), + of("id", BIGINT, "name", STRING))) + .as("test a narrower typed upcoming schema") + .isFalse(); + + Stream.of(TINYINT, SMALLINT, INT) + .forEach( + type -> + Assertions.assertThat( + isSchemaCompatible( + of("id", BIGINT, "number", BIGINT), + of("id", BIGINT, "number", type))) + .as("test fitting %s into BIGINT", type) + .isTrue()); + + Stream.of(TINYINT, SMALLINT, INT) + .forEach( + type -> + Assertions.assertThat( + isSchemaCompatible( + of("id", BIGINT, "number", type), + of("id", BIGINT, "number", BIGINT))) + .as("test fitting BIGINT into %s", type) + .isFalse()); + + Stream.of(TINYINT, SMALLINT, 
INT, BIGINT, DECIMAL, STRING) + .forEach( + type -> + Assertions.assertThat( + isSchemaCompatible( + of("id", BIGINT, "number", STRING), + of("id", BIGINT, "number", type))) + .as("test fitting %s into STRING", type) + .isTrue()); + + Stream.of(TINYINT, SMALLINT, INT, BIGINT, DECIMAL) + .forEach( + type -> + Assertions.assertThat( + isSchemaCompatible( + of("id", BIGINT, "number", type), + of("id", BIGINT, "number", STRING))) + .as("test fitting STRING into %s", type) + .isFalse()); + + Stream.of(FLOAT, DOUBLE, STRING) + .forEach( + type -> + Assertions.assertThat( + isSchemaCompatible( + of("id", BIGINT, "number", STRING), + of("id", BIGINT, "number", type))) + .as("test fitting %s into STRING", type) + .isTrue()); + + Stream.of(FLOAT, DOUBLE) + .forEach( + type -> + Assertions.assertThat( + isSchemaCompatible( + of("id", BIGINT, "number", type), + of("id", BIGINT, "number", STRING))) + .as("test fitting STRING into %s", type) + .isFalse()); + + Assertions.assertThat( + isSchemaCompatible( + of("id", BIGINT, "foo", INT), of("id", BIGINT, "bar", INT))) + .as("columns with different names") + .isFalse(); + } + + @Test + void testGetLeastCommonSchema() { + Assertions.assertThat(getLeastCommonSchema(null, of("id", BIGINT, "name", VARCHAR(17)))) + .as("test merging into an empty schema") + .isEqualTo(of("id", BIGINT, "name", VARCHAR(17))); + + Assertions.assertThat( + getLeastCommonSchema( + of("id", BIGINT, "name", VARCHAR(17)), + of("id", BIGINT, "name", VARCHAR(17)))) + .as("test identical schema") + .isEqualTo(of("id", BIGINT, "name", VARCHAR(17))); + + Assertions.assertThat( + getLeastCommonSchema( + of("id", BIGINT, "name", VARCHAR(17)), + of("name", VARCHAR(17), "id", BIGINT))) + .as("swapping sequence is ok") + .isEqualTo(of("id", BIGINT, "name", VARCHAR(17))); + + Assertions.assertThat( + getLeastCommonSchema( + of("id", BIGINT, "name", VARCHAR(17)), of("id", BIGINT))) + .as("test a wider upcoming schema") + .isEqualTo(of("id", BIGINT, "name", VARCHAR(17))); + 
+ Assertions.assertThat( + getLeastCommonSchema( + of("id", BIGINT), of("id", BIGINT, "name", VARCHAR(17)))) + .as("test a narrower upcoming schema") + .isEqualTo(of("id", BIGINT, "name", VARCHAR(17))); + + Assertions.assertThat( + getLeastCommonSchema( + of("id", BIGINT, "name", STRING), + of("id", BIGINT, "name", VARCHAR(17)))) + .as("test a wider typed upcoming schema") + .isEqualTo(of("id", BIGINT, "name", STRING)); + + Assertions.assertThat( + getLeastCommonSchema( + of("id", BIGINT, "name", VARCHAR(17)), + of("id", BIGINT, "name", STRING))) + .as("test a narrower typed upcoming schema") + .isEqualTo(of("id", BIGINT, "name", STRING)); + + Stream.of(TINYINT, SMALLINT, INT) + .forEach( + type -> + Assertions.assertThat( + getLeastCommonSchema( + of("id", BIGINT, "number", BIGINT), + of("id", BIGINT, "number", type))) + .as("test fitting %s into BIGINT", type) + .isEqualTo(of("id", BIGINT, "number", BIGINT))); + + Stream.of(TINYINT, SMALLINT, INT) + .forEach( + type -> + Assertions.assertThat( + getLeastCommonSchema( + of("id", BIGINT, "number", type), + of("id", BIGINT, "number", BIGINT))) + .as("test fitting BIGINT into %s", type) + .isEqualTo(of("id", BIGINT, "number", BIGINT))); + + Stream.of(TINYINT, SMALLINT, INT, BIGINT, DECIMAL, STRING) + .forEach( + type -> + Assertions.assertThat( + getLeastCommonSchema( + of("id", BIGINT, "number", STRING), + of("id", BIGINT, "number", type))) + .as("test fitting %s into STRING", type) + .isEqualTo(of("id", BIGINT, "number", STRING))); + + Stream.of(TINYINT, SMALLINT, INT, BIGINT, DECIMAL, STRING) + .forEach( + type -> + Assertions.assertThat( + getLeastCommonSchema( + of("id", BIGINT, "number", type), + of("id", BIGINT, "number", STRING))) + .as("test fitting STRING into %s", type) + .isEqualTo(of("id", BIGINT, "number", STRING))); + + Assertions.assertThat( + getLeastCommonSchema( + of("id", BIGINT, "foo", INT), of("id", BIGINT, "bar", INT))) + .as("columns with different names") + .isEqualTo(of("id", BIGINT, "foo", 
INT, "bar", INT)); + + Assertions.assertThat( + getLeastCommonSchema( + of("id", BIGINT, "foo", INT, "baz", FLOAT), + of("id", BIGINT, "bar", INT, "baz", DOUBLE))) + .as("mixed schema differences") + .isEqualTo(of("id", BIGINT, "foo", INT, "baz", DOUBLE, "bar", INT)); + } + + @Test + void testGetSchemaDifference() { + Assertions.assertThat( + getSchemaDifference(TABLE_ID, null, of("id", BIGINT, "name", VARCHAR(17)))) + .as("test merging into an empty schema") + .containsExactly( + new CreateTableEvent(TABLE_ID, of("id", BIGINT, "name", VARCHAR(17)))); + + Assertions.assertThat( + getSchemaDifference( + TABLE_ID, + of("id", BIGINT, "name", VARCHAR(17)), + of("id", BIGINT, "name", VARCHAR(17)))) + .as("test identical schema") + .isEmpty(); + + Assertions.assertThat( + getSchemaDifference( + TABLE_ID, + of("id", BIGINT, "name", VARCHAR(17)), + of("name", VARCHAR(17), "id", BIGINT))) + .as("swapping sequence is ok") + .isEmpty(); + + Assertions.assertThat( + getSchemaDifference( + TABLE_ID, of("id", BIGINT), of("id", BIGINT, "name", VARCHAR(17)))) + .as("test a widening upcoming schema") + .containsExactly( + new AddColumnEvent( + TABLE_ID, + Collections.singletonList( + new AddColumnEvent.ColumnWithPosition( + Column.physicalColumn("name", VARCHAR(17)), + AddColumnEvent.ColumnPosition.AFTER, + "id")))); + + Assertions.assertThat( + getSchemaDifference( + TABLE_ID, of("id", BIGINT), of("name", VARCHAR(17), "id", BIGINT))) + .as("test a widening upcoming schema at first") + .containsExactly( + new AddColumnEvent( + TABLE_ID, + Collections.singletonList( + new AddColumnEvent.ColumnWithPosition( + Column.physicalColumn("name", VARCHAR(17)), + AddColumnEvent.ColumnPosition.FIRST, + null)))); + + Assertions.assertThat( + getSchemaDifference( + TABLE_ID, + of("id", BIGINT, "name", VARCHAR(17)), + of("id", BIGINT, "name", STRING))) + .as("test a type-widening typed upcoming schema") + .containsExactly( + new AlterColumnTypeEvent( + TABLE_ID, + Collections.singletonMap("name", 
STRING), + Collections.singletonMap("name", VARCHAR(17)))); + Assertions.assertThat( + getSchemaDifference( + TABLE_ID, + of("id", BIGINT, "name", STRING, "number", BIGINT), + of("id", BIGINT))) + .as("test remove id while add gentle") + .containsExactly(new DropColumnEvent(TABLE_ID, Arrays.asList("number", "name"))); + Assertions.assertThat( + getSchemaDifference( + TABLE_ID, + of("id", BIGINT, "name", STRING, "number", BIGINT), + of("id", BIGINT, "name", STRING, "gentle", STRING))) + .as("test remove id while add gentle") + .containsExactly( + new AddColumnEvent( + TABLE_ID, + Collections.singletonList( + new AddColumnEvent.ColumnWithPosition( + Column.physicalColumn("gentle", STRING), + AddColumnEvent.ColumnPosition.AFTER, + "name"))), + new DropColumnEvent(TABLE_ID, Collections.singletonList("number"))); + Stream.of(TINYINT, SMALLINT, INT) + .forEach( + type -> + Assertions.assertThat( + getSchemaDifference( + TABLE_ID, + of("id", BIGINT, "number", type), + of("id", BIGINT, "number", BIGINT))) + .as("test escalating %s to BIGINT", type) + .containsExactly( + new AlterColumnTypeEvent( + TABLE_ID, + Collections.singletonMap("number", BIGINT), + Collections.singletonMap("number", type)))); + + Stream.of(TINYINT, SMALLINT, INT, BIGINT, DECIMAL, FLOAT) + .forEach( + type -> + Assertions.assertThat( + getSchemaDifference( + TABLE_ID, + of("id", BIGINT, "number", type), + of("id", BIGINT, "number", DOUBLE))) + .as("test escalating %s to DOUBLE", type) + .containsExactly( + new AlterColumnTypeEvent( + TABLE_ID, + Collections.singletonMap("number", DOUBLE), + Collections.singletonMap("number", type)))); + + Assertions.assertThat( + getSchemaDifference( + TABLE_ID, + of("id", BIGINT, "foo", INT, "baz", FLOAT), + of("id", BIGINT, "foo", BIGINT, "bar", INT, "baz", DOUBLE))) + .as("mixed schema differences") + .containsExactly( + new AddColumnEvent( + TABLE_ID, + Collections.singletonList( + new AddColumnEvent.ColumnWithPosition( + Column.physicalColumn("bar", INT), + 
AddColumnEvent.ColumnPosition.AFTER, + "foo"))), + new AlterColumnTypeEvent( + TABLE_ID, + ImmutableMap.of("foo", BIGINT, "baz", DOUBLE), + ImmutableMap.of("foo", INT, "baz", FLOAT))); + } + + @Test + void testMergeAndDiff() { + Assertions.assertThat(mergeAndDiff(null, of("id", BIGINT, "name", VARCHAR(17)))) + .as("test merging into an empty schema") + .containsExactly( + new CreateTableEvent(TABLE_ID, of("id", BIGINT, "name", VARCHAR(17)))); + + Assertions.assertThat( + mergeAndDiff( + of("id", BIGINT, "name", VARCHAR(17)), + of("id", BIGINT, "name", VARCHAR(17)))) + .as("test identical schema") + .isEmpty(); + + Assertions.assertThat( + mergeAndDiff( + of("id", BIGINT, "name", VARCHAR(17)), + of("name", VARCHAR(17), "id", BIGINT))) + .as("swapping sequence is ok") + .isEmpty(); + + Assertions.assertThat(mergeAndDiff(of("id", BIGINT, "name", VARCHAR(17)), of("id", BIGINT))) + .as("test a wider upcoming schema") + .isEmpty(); + + Assertions.assertThat(mergeAndDiff(of("id", BIGINT), of("id", BIGINT, "name", VARCHAR(17)))) + .as("test a narrower upcoming schema") + .containsExactly( + new AddColumnEvent( + TABLE_ID, + Collections.singletonList( + new AddColumnEvent.ColumnWithPosition( + Column.physicalColumn("name", VARCHAR(17)), + AddColumnEvent.ColumnPosition.AFTER, + "id")))); + + Assertions.assertThat( + mergeAndDiff( + of("id", BIGINT, "name", STRING), + of("id", BIGINT, "name", VARCHAR(17)))) + .as("test a wider typed upcoming schema") + .isEmpty(); + + Assertions.assertThat( + mergeAndDiff( + of("id", BIGINT, "name", VARCHAR(17)), + of("id", BIGINT, "name", STRING))) + .as("test a narrower typed upcoming schema") + .containsExactly( + new AlterColumnTypeEvent( + TABLE_ID, + Collections.singletonMap("name", STRING), + Collections.singletonMap("name", VARCHAR(17)))); + + Stream.of(TINYINT, SMALLINT, INT) + .forEach( + type -> + Assertions.assertThat( + mergeAndDiff( + of("id", BIGINT, "number", BIGINT), + of("id", BIGINT, "number", type))) + .as("test fitting %s 
into BIGINT", type) + .isEmpty()); + + Stream.of(TINYINT, SMALLINT, INT) + .forEach( + type -> + Assertions.assertThat( + mergeAndDiff( + of("id", BIGINT, "number", type), + of("id", BIGINT, "number", BIGINT))) + .as("test fitting BIGINT into %s", type) + .containsExactly( + new AlterColumnTypeEvent( + TABLE_ID, + Collections.singletonMap("number", BIGINT), + Collections.singletonMap("number", type)))); + + Stream.of(TINYINT, SMALLINT, INT, BIGINT, DECIMAL, STRING) + .forEach( + type -> + Assertions.assertThat( + mergeAndDiff( + of("id", BIGINT, "number", STRING), + of("id", BIGINT, "number", type))) + .as("test fitting %s into STRING", type) + .isEmpty()); + + Stream.of(TINYINT, SMALLINT, INT, BIGINT, DECIMAL) + .forEach( + type -> + Assertions.assertThat( + mergeAndDiff( + of("id", BIGINT, "number", type), + of("id", BIGINT, "number", STRING))) + .as("test fitting STRING into %s", type) + .containsExactly( + new AlterColumnTypeEvent( + TABLE_ID, + Collections.singletonMap("number", STRING), + Collections.singletonMap("number", type)))); + + Assertions.assertThat( + mergeAndDiff( + of("id", BIGINT, "foo", INT, "baz", FLOAT), + of("id", BIGINT, "bar", INT, "baz", DOUBLE))) + .as("mixed schema differences") + .containsExactly( + new AddColumnEvent( + TABLE_ID, + Collections.singletonList( + new AddColumnEvent.ColumnWithPosition( + Column.physicalColumn("bar", INT), + AddColumnEvent.ColumnPosition.AFTER, + "baz"))), + new AlterColumnTypeEvent( + TABLE_ID, + Collections.singletonMap("baz", DOUBLE), + Collections.singletonMap("baz", FLOAT))); + } + + @Test + void testIsDataTypeCompatible() { + List> viableConversions = + Arrays.asList( + Tuple2.of(CHAR, STRING), + Tuple2.of(VARCHAR, STRING), + Tuple2.of(BOOLEAN, STRING), + Tuple2.of(BINARY, STRING), + Tuple2.of(DOUBLE, STRING), + Tuple2.of(FLOAT, STRING), + Tuple2.of(DECIMAL, STRING), + Tuple2.of(BIGINT, STRING), + Tuple2.of(INT, STRING), + Tuple2.of(SMALLINT, STRING), + Tuple2.of(TINYINT, STRING), + 
Tuple2.of(TIMESTAMP_TZ, STRING), + Tuple2.of(TIMESTAMP_LTZ, STRING), + Tuple2.of(TIMESTAMP, STRING), + Tuple2.of(DATE, STRING), + Tuple2.of(TIME, STRING), + Tuple2.of(ROW, STRING), + Tuple2.of(ARRAY, STRING), + Tuple2.of(MAP, STRING), + Tuple2.of(TINYINT, SMALLINT), + Tuple2.of(SMALLINT, INT), + Tuple2.of(INT, BIGINT), + Tuple2.of(BIGINT, DECIMAL), + Tuple2.of(DECIMAL, STRING), + Tuple2.of(FLOAT, DOUBLE), + Tuple2.of(DATE, TIMESTAMP), + Tuple2.of(TIMESTAMP, TIMESTAMP_LTZ), + Tuple2.of(TIMESTAMP_LTZ, TIMESTAMP_TZ)); + + List> infeasibleConversions = + Arrays.asList( + Tuple2.of(CHAR, BOOLEAN), + Tuple2.of(BOOLEAN, BINARY), + Tuple2.of(BINARY, DOUBLE), + Tuple2.of(DOUBLE, TIMESTAMP_TZ), + Tuple2.of(TIMESTAMP_TZ, TIME), + Tuple2.of(TIME, ROW), + Tuple2.of(ROW, ARRAY), + Tuple2.of(ARRAY, MAP)); + + viableConversions.forEach( + conv -> + Assertions.assertThat(isDataTypeCompatible(conv.f1, conv.f0)) + .as("test fitting %s into %s", conv.f0, conv.f1) + .isTrue()); + + viableConversions.forEach( + conv -> + Assertions.assertThat(isDataTypeCompatible(conv.f0, conv.f1)) + .as("test fitting %s into %s", conv.f1, conv.f0) + .isFalse()); + + infeasibleConversions.forEach( + conv -> + Assertions.assertThat(isDataTypeCompatible(conv.f1, conv.f0)) + .as("test fitting %s into %s", conv.f0, conv.f1) + .isFalse()); + + infeasibleConversions.forEach( + conv -> + Assertions.assertThat(isDataTypeCompatible(conv.f0, conv.f1)) + .as("test fitting %s into %s", conv.f1, conv.f0) + .isFalse()); + } + + @Test + void testCoerceObject() { + Stream> conversionExpects = + Stream.of( + // From TINYINT + Tuple4.of(TINYINT, (byte) 0, TINYINT, (byte) 0), + Tuple4.of(TINYINT, (byte) 1, SMALLINT, (short) 1), + Tuple4.of(TINYINT, (byte) 2, INT, 2), + Tuple4.of(TINYINT, (byte) 3, BIGINT, 3L), + Tuple4.of(TINYINT, (byte) 4, DECIMAL, decOf(4)), + Tuple4.of(TINYINT, (byte) 5, FLOAT, 5.0f), + Tuple4.of(TINYINT, (byte) 6, DOUBLE, 6.0), + Tuple4.of(TINYINT, (byte) 7, STRING, binStrOf("7")), + + // From 
SMALLINT + Tuple4.of(SMALLINT, (short) 1, SMALLINT, (short) 1), + Tuple4.of(SMALLINT, (short) 2, INT, 2), + Tuple4.of(SMALLINT, (short) 3, BIGINT, 3L), + Tuple4.of(SMALLINT, (short) 4, DECIMAL, decOf(4)), + Tuple4.of(SMALLINT, (short) 5, FLOAT, 5.0f), + Tuple4.of(SMALLINT, (short) 6, DOUBLE, 6.0), + Tuple4.of(SMALLINT, (short) 7, STRING, binStrOf("7")), + + // From INT + Tuple4.of(INT, 2, INT, 2), + Tuple4.of(INT, 3, BIGINT, 3L), + Tuple4.of(INT, 4, DECIMAL, decOf(4)), + Tuple4.of(INT, 5, FLOAT, 5.0f), + Tuple4.of(INT, 6, DOUBLE, 6.0), + Tuple4.of(INT, 7, STRING, binStrOf("7")), + + // From BIGINT + Tuple4.of(BIGINT, 3L, BIGINT, 3L), + Tuple4.of(BIGINT, 4L, DECIMAL, decOf(4)), + Tuple4.of(BIGINT, 5L, FLOAT, 5.0f), + Tuple4.of(BIGINT, 6L, DOUBLE, 6.0), + Tuple4.of(BIGINT, 7L, STRING, binStrOf("7")), + + // From DECIMAL + Tuple4.of(DECIMAL, decOf(4), DECIMAL, decOf(4)), + Tuple4.of(DECIMAL, decOf(5), FLOAT, 5.0f), + Tuple4.of(DECIMAL, decOf(6), DOUBLE, 6.0), + Tuple4.of(DECIMAL, decOf(7), STRING, binStrOf("7")), + + // From FLOAT + Tuple4.of(FLOAT, 5.0f, FLOAT, 5.0f), + Tuple4.of(FLOAT, 6.0f, DOUBLE, 6.0), + Tuple4.of(FLOAT, 7.0f, STRING, binStrOf("7.0")), + + // From DOUBLE + Tuple4.of(DOUBLE, 6.0f, DOUBLE, 6.0), + Tuple4.of(DOUBLE, 7.0f, STRING, binStrOf("7.0")), + + // From STRING + Tuple4.of(STRING, binStrOf("AtoZ"), STRING, binStrOf("AtoZ")), + Tuple4.of(STRING, binStrOf("lie"), STRING, binStrOf("lie")), + + // From CHAR + Tuple4.of( + CHAR, binStrOf("les miserables"), CHAR, binStrOf("les miserables")), + Tuple4.of(CHAR, binStrOf("notre dame"), STRING, binStrOf("notre dame")), + + // From Binary + Tuple4.of(BINARY, binOf("les miserables"), BINARY, binOf("les miserables")), + Tuple4.of( + BINARY, binOf("notre dame"), STRING, binStrOf("bm90cmUgZGFtZQ==")), + + // From BOOLEAN + Tuple4.of(BOOLEAN, true, BOOLEAN, true), + Tuple4.of(BOOLEAN, false, BOOLEAN, false), + Tuple4.of(BOOLEAN, true, STRING, binStrOf("true")), + Tuple4.of(BOOLEAN, false, STRING, 
binStrOf("false")), + + // From DATE + Tuple4.of(DATE, dateOf(2017, 1, 1), DATE, dateOf(2017, 1, 1)), + Tuple4.of(DATE, dateOf(2018, 2, 2), TIMESTAMP, tsOf("2018", "02", "02")), + Tuple4.of( + DATE, + dateOf(2019, 3, 3), + TIMESTAMP_LTZ, + ltzTsOf("2019", "03", "03")), + Tuple4.of( + DATE, dateOf(2020, 4, 4), TIMESTAMP_TZ, zTsOf("2020", "04", "04")), + Tuple4.of(DATE, dateOf(2021, 5, 5), STRING, binStrOf("2021-05-05")), + + // From TIME + Tuple4.of(TIME, timeOf(21, 48, 25), TIME, timeOf(21, 48, 25)), + Tuple4.of(TIME, timeOf(21, 48, 25), STRING, binStrOf("21:48:25")), + + // From TIMESTAMP + Tuple4.of( + TIMESTAMP, + tsOf("2022", "06", "06"), + TIMESTAMP, + tsOf("2022", "06", "06")), + Tuple4.of( + TIMESTAMP, + tsOf("2023", "07", "07"), + TIMESTAMP_LTZ, + ltzTsOf("2023", "07", "07")), + Tuple4.of( + TIMESTAMP, + tsOf("2024", "08", "08"), + TIMESTAMP_TZ, + zTsOf("2024", "08", "08")), + Tuple4.of( + TIMESTAMP, + tsOf("2025", "09", "09"), + STRING, + binStrOf("2025-09-09T00:00")), + + // From TIMESTAMP_LTZ + Tuple4.of( + TIMESTAMP_LTZ, + ltzTsOf("2026", "10", "10"), + TIMESTAMP_LTZ, + ltzTsOf("2026", "10", "10")), + Tuple4.of( + TIMESTAMP_LTZ, + ltzTsOf("2027", "11", "11"), + TIMESTAMP_TZ, + zTsOf("2027", "11", "11")), + Tuple4.of( + TIMESTAMP_LTZ, + ltzTsOf("2028", "12", "12"), + STRING, + binStrOf("2028-12-12T00:00")), + + // From TIMESTAMP_TZ + Tuple4.of( + TIMESTAMP_TZ, + zTsOf("2018", "01", "01"), + TIMESTAMP_TZ, + zTsOf("2018", "01", "01")), + Tuple4.of( + TIMESTAMP_TZ, + zTsOf("2019", "02", "02"), + STRING, + binStrOf("2019-02-02T00:00:00Z"))); + + conversionExpects.forEach( + rule -> + Assertions.assertThat(coerceObject("UTC", rule.f1, rule.f0, rule.f2)) + .as("Try coercing %s (%s) to %s type", rule.f1, rule.f0, rule.f2) + .isEqualTo(rule.f3)); + } + + @Test + void testCoerceRow() { + Assertions.assertThat( + coerceRow( + "UTC", + of("id", BIGINT, "name", VARCHAR(17)), + of("id", BIGINT, "name", VARCHAR(17)), + Arrays.asList(2L, binStrOf("Bob")))) + .as("test 
identical schema") + .containsExactly(2L, binStrOf("Bob")); + + Assertions.assertThat( + coerceRow( + "UTC", + of("id", BIGINT, "name", VARCHAR(17)), + of("name", VARCHAR(17), "id", BIGINT), + Arrays.asList(binStrOf("Cecily"), 3L))) + .as("swapping sequence is ok") + .containsExactly(3L, binStrOf("Cecily")); + + Assertions.assertThat( + coerceRow( + "UTC", + of("id", BIGINT, "name", VARCHAR(17)), + of("id", BIGINT), + Collections.singletonList(4L))) + .as("test a wider upcoming schema") + .containsExactly(4L, null); + + Assertions.assertThat( + coerceRow( + "UTC", + of("id", BIGINT, "name", STRING), + of("id", BIGINT, "name", VARCHAR(17)), + Arrays.asList(4L, "Derrida"))) + .as("test a wider typed upcoming schema") + .containsExactly(4L, binStrOf("Derrida")); + + Stream.of(TINYINT, SMALLINT, INT) + .forEach( + type -> + Assertions.assertThat( + coerceRow( + "UTC", + of("id", BIGINT, "number", BIGINT), + of("id", BIGINT, "number", type), + Arrays.asList(5L, DUMMY_OBJECTS.get(type)))) + .as("test fitting %s into BIGINT", type) + .containsExactly(5L, 17L)); + + Stream.of(TINYINT, SMALLINT, INT, BIGINT, DECIMAL, FLOAT) + .forEach( + type -> + Assertions.assertThat( + coerceRow( + "UTC", + of("id", BIGINT, "number", DOUBLE), + of("id", BIGINT, "number", type), + Arrays.asList(6L, DUMMY_OBJECTS.get(type)))) + .as("test fitting %s into DOUBLE", type) + .containsExactly(6L, 17.0)); + + // Test coercing with NULL + Assertions.assertThat( + coerceRow( + "UTC", + of("id", BIGINT, "name", VARCHAR(17)), + of("id", BIGINT, "name", VARCHAR(17)), + Arrays.asList(2L, null))) + .as("test identical schema") + .containsExactly(2L, null); + + Assertions.assertThat( + coerceRow( + "UTC", + of("id", BIGINT, "name", VARCHAR(17)), + of("name", VARCHAR(17), "id", BIGINT), + Arrays.asList(null, 3L))) + .as("swapping sequence is ok") + .containsExactly(3L, null); + + Assertions.assertThat( + coerceRow( + "UTC", + of("id", BIGINT, "name", VARCHAR(17)), + of("id", BIGINT), + 
Collections.singletonList(4L))) + .as("test a wider upcoming schema") + .containsExactly(4L, null); + + Assertions.assertThat( + coerceRow( + "UTC", + of("id", BIGINT, "name", STRING), + of("id", BIGINT, "name", VARCHAR(17)), + Arrays.asList(4L, null))) + .as("test a wider typed upcoming schema") + .containsExactly(4L, null); + + Stream.of(TINYINT, SMALLINT, INT) + .forEach( + type -> + Assertions.assertThat( + coerceRow( + "UTC", + of("id", BIGINT, "number", BIGINT), + of("id", BIGINT, "number", type), + Arrays.asList(5L, null))) + .as("test fitting %s into BIGINT", type) + .containsExactly(5L, null)); + + Stream.of(TINYINT, SMALLINT, INT, BIGINT, DECIMAL, FLOAT) + .forEach( + type -> + Assertions.assertThat( + coerceRow( + "UTC", + of("id", BIGINT, "number", DOUBLE), + of("id", BIGINT, "number", type), + Arrays.asList(6L, null))) + .as("test fitting %s into DOUBLE", type) + .containsExactly(6L, null)); + } + + @Test + void testGetLeastCommonType() { + // To-be-merged types are: + // STRING, CHAR, VARCHAR, BINARY, VARBINARY, TINYINT, SMALLINT, INT, BIGINT, + // DECIMAL, FLOAT, DOUBLE, TIMESTAMP, TIMESTAMP_LTZ, TIMESTAMP_TZ, TIME, ROW, ARRAY, + // MAP + + assertTypeMergingVector( + STRING, + Arrays.asList( + STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING)); + + assertTypeMergingVector( + CHAR, + Arrays.asList( + STRING, CHAR, STRING, STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING)); + + assertTypeMergingVector( + VARCHAR, + Arrays.asList( + STRING, STRING, VARCHAR, STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING)); + + assertTypeMergingVector( + BINARY, + Arrays.asList( + STRING, STRING, STRING, BINARY, STRING, STRING, STRING, STRING, STRING, + STRING, STRING, STRING, 
STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING)); + + assertTypeMergingVector( + VARBINARY, + Arrays.asList( + STRING, STRING, STRING, STRING, VARBINARY, STRING, STRING, STRING, STRING, + STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING)); + + // 8-bit TINYINT could fit into FLOAT (24 sig bits) or DOUBLE (53 sig bits) + assertTypeMergingVector( + TINYINT, + Arrays.asList( + STRING, STRING, STRING, STRING, STRING, TINYINT, SMALLINT, INT, BIGINT, + DECIMAL, FLOAT, DOUBLE, STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING)); + + // 16-bit SMALLINT could fit into FLOAT (24 sig bits) or DOUBLE (53 sig bits) + assertTypeMergingVector( + SMALLINT, + Arrays.asList( + STRING, STRING, STRING, STRING, STRING, SMALLINT, SMALLINT, INT, BIGINT, + DECIMAL, FLOAT, DOUBLE, STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING)); + + // 32-bit INT could fit into DOUBLE (53 sig bits) + assertTypeMergingVector( + INT, + Arrays.asList( + STRING, STRING, STRING, STRING, STRING, INT, INT, INT, BIGINT, DECIMAL, + DOUBLE, DOUBLE, STRING, STRING, STRING, STRING, STRING, STRING, STRING, + STRING)); + + assertTypeMergingVector( + BIGINT, + Arrays.asList( + STRING, STRING, STRING, STRING, STRING, BIGINT, BIGINT, BIGINT, BIGINT, + DECIMAL, DOUBLE, DOUBLE, STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING)); + + assertTypeMergingVector( + DECIMAL, + Arrays.asList( + STRING, STRING, STRING, STRING, STRING, DECIMAL, DECIMAL, DECIMAL, DECIMAL, + DECIMAL, STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING)); + + assertTypeMergingVector( + FLOAT, + Arrays.asList( + STRING, STRING, STRING, STRING, STRING, FLOAT, FLOAT, DOUBLE, DOUBLE, + STRING, FLOAT, DOUBLE, STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING)); + + assertTypeMergingVector( + DOUBLE, + Arrays.asList( + STRING, STRING, STRING, STRING, STRING, DOUBLE, DOUBLE, DOUBLE, DOUBLE, + STRING, DOUBLE, 
DOUBLE, STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING)); + + assertTypeMergingVector( + TIMESTAMP, + Arrays.asList( + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + TIMESTAMP, + TIMESTAMP_LTZ, + TIMESTAMP_TZ, + STRING, + STRING, + STRING, + STRING, + STRING)); + + assertTypeMergingVector( + TIMESTAMP_LTZ, + Arrays.asList( + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + TIMESTAMP_LTZ, + TIMESTAMP_LTZ, + TIMESTAMP_TZ, + STRING, + STRING, + STRING, + STRING, + STRING)); + + assertTypeMergingVector( + TIMESTAMP_TZ, + Arrays.asList( + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + STRING, + TIMESTAMP_TZ, + TIMESTAMP_TZ, + TIMESTAMP_TZ, + STRING, + STRING, + STRING, + STRING, + STRING)); + + assertTypeMergingVector( + TIME, + Arrays.asList( + STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING, STRING, STRING, STRING, STRING, TIME, STRING, STRING, + STRING, STRING)); + + assertTypeMergingVector( + ROW, + Arrays.asList( + STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING, STRING, STRING, STRING, STRING, STRING, ROW, STRING, STRING, + STRING)); + + assertTypeMergingVector( + ARRAY, + Arrays.asList( + STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, ARRAY, + STRING, STRING)); + + assertTypeMergingVector( + MAP, + Arrays.asList( + STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, MAP, + STRING)); + + assertTypeMergingVector( + VARIANT, + Arrays.asList( + STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, + STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, 
STRING, + STRING, VARIANT)); + } + + private static void assertTypeMergingVector(DataType incomingType, List resultTypes) { + Assertions.assertThat(ALL_TYPES) + .map(type -> getLeastCommonType(type, incomingType)) + .containsExactlyElementsOf(resultTypes) + // Flip LHS and RHS should emit same outputs + .map(type -> getLeastCommonType(incomingType, type)) + .containsExactlyElementsOf(resultTypes); + } + + // Some testing utility methods. + + private static List mergeAndDiff( + @Nullable Schema currentSchema, Schema upcomingSchema) { + Schema afterSchema = getLeastCommonSchema(currentSchema, upcomingSchema); + return getSchemaDifference(TABLE_ID, currentSchema, afterSchema); + } + + private static Schema of(Object... args) { + List argList = new ArrayList<>(Arrays.asList(args)); + Preconditions.checkState(argList.size() % 2 == 0); + Schema.Builder builder = Schema.newBuilder(); + while (!argList.isEmpty()) { + String colName = (String) argList.remove(0); + DataType colType = (DataType) argList.remove(0); + builder.physicalColumn(colName, colType); + } + return builder.build(); + } + + private static DateData dateOf(int year, int month, int dayOfMonth) { + return DateData.fromLocalDate(LocalDate.of(year, month, dayOfMonth)); + } + + private static TimeData timeOf(int hour, int minute, int second) { + return TimeData.fromLocalTime(LocalTime.of(hour, minute, second)); + } + + private static TimestampData tsOf(String year, String month, String dayOfMonth) { + return TimestampData.fromTimestamp( + Timestamp.valueOf(String.format("%s-%s-%s 00:00:00", year, month, dayOfMonth))); + } + + private static LocalZonedTimestampData ltzTsOf(String year, String month, String dayOfMonth) { + return LocalZonedTimestampData.fromEpochMillis( + Instant.parse(String.format("%s-%s-%sT00:00:00Z", year, month, dayOfMonth)) + .toEpochMilli()); + } + + private static ZonedTimestampData zTsOf(String year, String month, String dayOfMonth) { + return ZonedTimestampData.fromZonedDateTime( + 
ZonedDateTime.ofInstant( + Instant.parse(String.format("%s-%s-%sT00:00:00Z", year, month, dayOfMonth)), + ZoneId.of("UTC"))); + } + + private static DecimalData decOf(long value) { + return DecimalData.fromBigDecimal( + BigDecimal.valueOf(value), DecimalType.MAX_PRECISION, DecimalType.DEFAULT_SCALE); + } + + private static BinaryStringData binStrOf(String str) { + return BinaryStringData.fromString(str); + } + + private static byte[] binOf(String str) { + return str.getBytes(); + } +} diff --git a/flink-cdc-common-2.x/src/test/resources/log4j2-test.properties b/flink-cdc-common-2.x/src/test/resources/log4j2-test.properties new file mode 100644 index 00000000000..0d45bab8011 --- /dev/null +++ b/flink-cdc-common-2.x/src/test/resources/log4j2-test.properties @@ -0,0 +1,25 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +# Set root logger level to ERROR to not flood build logs +# set manually to INFO for debugging purposes +rootLogger.level = ERROR +rootLogger.appenderRef.test.ref = TestLogger + +appender.testlogger.name = TestLogger +appender.testlogger.type = CONSOLE +appender.testlogger.target = SYSTEM_ERR +appender.testlogger.layout.type = PatternLayout +appender.testlogger.layout.pattern = %-4r [%t] %-5p %c - %m%n diff --git a/flink-cdc-common/src/main/java/org/apache/flink/cdc/common/types/VariantType.java b/flink-cdc-common/src/main/java/org/apache/flink/cdc/common/types/VariantType.java index d07907c44b9..70ab05a7d72 100644 --- a/flink-cdc-common/src/main/java/org/apache/flink/cdc/common/types/VariantType.java +++ b/flink-cdc-common/src/main/java/org/apache/flink/cdc/common/types/VariantType.java @@ -17,7 +17,7 @@ package org.apache.flink.cdc.common.types; -import org.apache.flink.annotation.PublicEvolving; +import org.apache.flink.cdc.common.annotation.PublicEvolving; import java.util.Collections; import java.util.List; diff --git a/flink-cdc-common/src/main/java/org/apache/flink/cdc/common/utils/ThreadLocalCache.java b/flink-cdc-common/src/main/java/org/apache/flink/cdc/common/utils/ThreadLocalCache.java index f4b8101ed60..03aef691671 100644 --- a/flink-cdc-common/src/main/java/org/apache/flink/cdc/common/utils/ThreadLocalCache.java +++ b/flink-cdc-common/src/main/java/org/apache/flink/cdc/common/utils/ThreadLocalCache.java @@ -17,7 +17,7 @@ package org.apache.flink.cdc.common.utils; -import org.apache.flink.annotation.Internal; +import org.apache.flink.cdc.common.annotation.Internal; import java.util.LinkedHashMap; import java.util.Map; diff --git a/pom.xml b/pom.xml index 61d657a91ce..b740dd7cc89 100644 --- a/pom.xml +++ b/pom.xml @@ -35,6 +35,7 @@ limitations under the License. 
flink-cdc-cli flink-cdc-common + flink-cdc-common-2.x flink-cdc-composer flink-cdc-dist flink-cdc-connect @@ -67,8 +68,13 @@ limitations under the License. true - 1.20.3 + 1.20.3 + 2.2.0 + ${flink.1.x.version} 1.20 + 31.1-jre-17.0 + 33.4.0-jre-20.0 + ${flink.1.x.shaded.guava.version} 17.0 1.9.8.Final 3.2.0 @@ -289,7 +295,7 @@ limitations under the License. org.apache.flink flink-shaded-guava - 31.1-jre-${flink.shaded.version} + ${shaded.guava.version}