Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ static CloseableIterator<FilteredColumnarBatch> transformLogicalData(
}

ColumnarBatch data = filteredBatch.getData();
if (!data.getSchema().equals(tableSchema)) {
if (!data.getSchema().equivalentIgnoreCollations(tableSchema)) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I left a similar comment on the other PR, but how do we know when/where we should use this instead? How can we be sure all the instances of comparison we do throughout the code-base won't now be an issue? concerned there might be somewhere that would fail only if a test had collations in the schema, which the majority of existing tests won't

It seems like we need to audit everywhere we might compare data types or schemas?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm worried we do this type of comparison all over..

throw dataSchemaMismatch(tablePath, tableSchema, data.getSchema());
}
for (String partitionColName : partitionColNames) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,8 @@ private void validateLiteralType(StructField field, Literal literal) {
// Variant stats in JSON are Z85 encoded strings, all other stats should match the field type
DataType expectedLiteralType =
field.getDataType() instanceof VariantType ? StringType.STRING : field.getDataType();
if (literal.getDataType() == null || !literal.getDataType().equals(expectedLiteralType)) {
if (literal.getDataType() == null
|| !literal.getDataType().equivalentIgnoreCollations(expectedLiteralType)) {
throw DeltaErrors.statsTypeMismatch(
field.getName(), expectedLiteralType, literal.getDataType());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,26 @@ public boolean equivalent(DataType dataType) {
&& ((ArrayType) dataType).getElementType().equivalent(getElementType());
}

/**
* Are the data types same? The collations could be different.
*
* @param dataType
* @return
*/
@Override
public boolean equivalentIgnoreCollations(DataType dataType) {
if (this == dataType) {
return true;
}
if (dataType == null || getClass() != dataType.getClass()) {
return false;
}
ArrayType arrayType = (ArrayType) dataType;
return (elementField == null && arrayType.elementField == null)
|| (elementField != null
&& elementField.equivalentIgnoreCollations(arrayType.elementField));
}

@Override
public boolean isNested() {
return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,16 @@ public boolean equivalent(DataType dataType) {
return equals(dataType);
}

/**
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it worth having this + the other equivalent method or might they satisfy the same goal?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think equivalentIgnoreCollations is needed because of the cases like https://github.com/delta-io/delta/pull/5357/files#diff-c9fd0f3e881617ea0f2439a29c32a35e8c32fbcdda229105be76ece4001819acR200. Here we don't to ignore names and metadata.

* Are the data types same? The collations could be different.
*
* @param dataType
* @return
*/
public boolean equivalentIgnoreCollations(DataType dataType) {
return equals(dataType);
}

/**
* Returns true iff this data is a nested data type (it logically parameterized by other types).
*
Expand Down
21 changes: 21 additions & 0 deletions kernel/kernel-api/src/main/java/io/delta/kernel/types/MapType.java
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,27 @@ public boolean equivalent(DataType dataType) {
&& ((MapType) dataType).isValueContainsNull() == isValueContainsNull();
}

/**
* Are the data types same? The collations could be different.
*
* @param dataType
* @return
*/
@Override
public boolean equivalentIgnoreCollations(DataType dataType) {
if (this == dataType) {
return true;
}
if (dataType == null || getClass() != dataType.getClass()) {
return false;
}
MapType mapType = (MapType) dataType;
return ((keyField == null && mapType.keyField == null)
|| (keyField != null && keyField.equivalentIgnoreCollations(mapType.keyField)))
&& ((valueField == null && mapType.valueField == null)
|| (valueField != null && valueField.equivalentIgnoreCollations(mapType.valueField)));
}

@Override
public boolean equals(Object o) {
if (this == o) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,17 @@ public CollationIdentifier getCollationIdentifier() {
return collationIdentifier;
}

/**
* Are the data types same? The collations could be different.
*
* @param dataType
* @return
*/
@Override
public boolean equivalentIgnoreCollations(DataType dataType) {
return dataType instanceof StringType;
}

@Override
public boolean equals(Object o) {
if (!(o instanceof StringType)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,30 @@ public boolean equals(Object o) {
&& Objects.equals(typeChanges, that.typeChanges);
}

/** @return whether the struct fields are equal, ignoring collations */
public boolean equivalentIgnoreCollations(StructField other) {
if (this == other) {
return true;
}
if (other == null) {
return false;
}
// Compare metadata while ignoring collation metadata differences
FieldMetadata metadataWithoutCollations =
new FieldMetadata.Builder().fromMetadata(metadata).remove(COLLATIONS_METADATA_KEY).build();
FieldMetadata otherMetadataWithoutCollations =
new FieldMetadata.Builder()
.fromMetadata(other.metadata)
.remove(COLLATIONS_METADATA_KEY)
.build();

return nullable == other.nullable
&& name.equals(other.name)
&& dataType.equivalentIgnoreCollations(other.dataType)
&& metadataWithoutCollations.equals(otherMetadataWithoutCollations)
&& Objects.equals(typeChanges, other.typeChanges);
}

@Override
public int hashCode() {
return Objects.hash(name, dataType, nullable, metadata, typeChanges);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,28 @@ public boolean equivalent(DataType dataType) {
.allMatch(result -> result);
}

/**
* Are the data types same? The collations could be different.
*
* @param dataType
* @return
*/
@Override
public boolean equivalentIgnoreCollations(DataType dataType) {
if (this == dataType) {
return true;
}
if (dataType == null || getClass() != dataType.getClass()) {
return false;
}
StructType structType = (StructType) dataType;
return this.length() == structType.length()
&& fieldNames.equals(structType.fieldNames)
&& IntStream.range(0, this.length())
.mapToObj(i -> this.at(i).equivalentIgnoreCollations(structType.at(i)))
.allMatch(result -> result);
}

@Override
public boolean isNested() {
return true;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
/*
* Copyright (2025) The Delta Lake Project Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.delta.kernel.types

import org.scalatest.funsuite.AnyFunSuite

class DataTypeSuite extends AnyFunSuite {

test("test equivalentIgnoreCollations") {
val utf8LcaseString = new StringType("SPARK.UTF8_LCASE")
val unicodeString = new StringType("ICU.UNICODE")

val testCases = Seq(
(StringType.STRING, StringType.STRING, true),
(StringType.STRING, utf8LcaseString, true),
(IntegerType.INTEGER, StringType.STRING, false),
(utf8LcaseString, unicodeString, true),
(
new ArrayType(StringType.STRING, true),
new ArrayType(utf8LcaseString, true),
true),
(
new ArrayType(unicodeString, false),
new ArrayType(StringType.STRING, false),
true),
(
new ArrayType(StringType.STRING, true),
new ArrayType(utf8LcaseString, false),
false),
(
new MapType(StringType.STRING, IntegerType.INTEGER, false),
new MapType(utf8LcaseString, IntegerType.INTEGER, false),
true),
(
new MapType(unicodeString, IntegerType.INTEGER, false),
new MapType(utf8LcaseString, IntegerType.INTEGER, false),
true),
(
new MapType(unicodeString, IntegerType.INTEGER, false),
new MapType(utf8LcaseString, IntegerType.INTEGER, true),
false),
(
new StructType()
.add("name", StringType.STRING)
.add("age", IntegerType.INTEGER),
new StructType()
.add("name", utf8LcaseString)
.add("age", IntegerType.INTEGER),
true),
(
new StructType()
.add("name", StringType.STRING)
.add("details", new StructType().add("address", StringType.STRING)),
new StructType()
.add("name", unicodeString)
.add("details", new StructType().add("address", utf8LcaseString)),
true),
(
new StructType()
.add("c1", new ArrayType(unicodeString, true))
.add("c2", new MapType(StringType.STRING, utf8LcaseString, false)),
new StructType()
.add("c1", new ArrayType(StringType.STRING, true))
.add("c2", new MapType(utf8LcaseString, unicodeString, false)),
true),
(
new StructType()
.add("c1", new ArrayType(unicodeString, false))
.add("c2", new MapType(StringType.STRING, utf8LcaseString, false)),
new StructType()
.add("c1", new ArrayType(StringType.STRING, true))
.add("c2", new MapType(utf8LcaseString, unicodeString, false)),
false),
(
new StructType()
.add("c1", new ArrayType(IntegerType.INTEGER, true))
.add("c2", new MapType(StringType.STRING, utf8LcaseString, false)),
new StructType()
.add("c1", new ArrayType(StringType.STRING, true))
.add("c2", new MapType(utf8LcaseString, unicodeString, false)),
false),
(
new ArrayType(
new StructType().add("c1", new MapType(utf8LcaseString, StringType.STRING, true), true),
true),
new ArrayType(
new StructType().add("c1", new MapType(StringType.STRING, utf8LcaseString, true), true),
true),
true),
(
new ArrayType(
new StructType().add("c1", new MapType(utf8LcaseString, StringType.STRING, true), true),
true),
new ArrayType(
new StructType().add("c2", new MapType(StringType.STRING, utf8LcaseString, true), true),
true),
false),
(
new ArrayType(
new StructType().add("c1", new MapType(utf8LcaseString, StringType.STRING, true), false),
true),
new ArrayType(
new StructType().add("c1", new MapType(StringType.STRING, utf8LcaseString, true), true),
true),
false),
(
new MapType(
new StructType().add("c1", utf8LcaseString),
new ArrayType(utf8LcaseString, false),
true),
new MapType(
new StructType().add("c1", StringType.STRING),
new ArrayType(utf8LcaseString, false),
true),
true),
(
new MapType(
new StructType().add("c1", utf8LcaseString),
new ArrayType(utf8LcaseString, false),
false),
new MapType(
new StructType().add("c1", StringType.STRING),
new ArrayType(utf8LcaseString, false),
true),
false),
(
new MapType(new StructType().add("c1", utf8LcaseString), StringType.STRING, false),
new MapType(new StructType().add("c1", StringType.STRING), utf8LcaseString, true),
false))

testCases.foreach { case (dt1, dt2, expected) =>
assert(dt1.equivalentIgnoreCollations(dt2) == expected)
}
}
}
Loading
Loading