
Commit 01a756f

Merge pull request #156 from caraml-dev/support-array-datatype
feat: add array datatype on MaxCompute reader
2 parents: 1fcaccb + 5591b09


6 files changed: +132 −42 lines


caraml-store-pyspark/scripts/historical_feature_retrieval_job.py

Lines changed: 46 additions & 30 deletions
```diff
@@ -358,29 +358,6 @@ class Field(NamedTuple):
     name: str
     type: str
 
-    @property
-    def spark_type(self):
-        """
-        Returns Spark data type that corresponds to the field's Feast type
-        """
-        feast_to_spark_type_mapping = {
-            "bytes": "binary",
-            "string": "string",
-            "int32": "int",
-            "int64": "bigint",
-            "double": "double",
-            "float": "float",
-            "bool": "boolean",
-            "bytes_list": "array<binary>",
-            "string_list": "array<string>",
-            "int32_list": "array<int>",
-            "int64_list": "array<bigint>",
-            "double_list": "array<double>",
-            "float_list": "array<float>",
-            "bool_list": "array<boolean>",
-        }
-        return feast_to_spark_type_mapping[self.type.lower()]
-
 
 class FeatureTable(NamedTuple):
     """
@@ -463,6 +440,43 @@ def entity_selections(self):
         return [f"{self.field_mapping.get(entity, entity)} as {entity}" for entity in self.entities]
 
 
+def _spark_type(field: Field, source: Source) -> str:
+    if isinstance(source, MaxComputeSource):
+        return {
+            "bytes": "tinyint",
+            "string": "string",
+            "int32": "int",
+            "int64": "bigint",
+            "double": "double",
+            "float": "float",
+            "bool": "boolean",
+            "bytes_list": "array<tinyint>",
+            "string_list": "array<string>",
+            "int32_list": "array<int>",
+            "int64_list": "array<bigint>",
+            "double_list": "array<double>",
+            "float_list": "array<float>",
+            "bool_list": "array<boolean>",
+        }[field.type.lower()]
+    else:
+        return {
+            "bytes": "binary",
+            "string": "string",
+            "int32": "int",
+            "int64": "bigint",
+            "double": "double",
+            "float": "float",
+            "bool": "boolean",
+            "bytes_list": "array<binary>",
+            "string_list": "array<string>",
+            "int32_list": "array<int>",
+            "int64_list": "array<bigint>",
+            "double_list": "array<double>",
+            "float_list": "array<float>",
+            "bool_list": "array<boolean>",
+        }[field.type.lower()]
+
+
 def _map_column(df: DataFrame, col_mapping: Dict[str, str]):
     source_to_alias_map = {v: k for k, v in col_mapping.items()}
     projection = {}
@@ -820,15 +834,16 @@ def _read_and_verify_feature_table_df_from_source(
     feature_table_dtypes = dict(mapped_source_df.dtypes)
     for field in feature_table.entities + feature_table.features:
         column_type = feature_table_dtypes.get(field.name)
+        spark_type = _spark_type(field, source)
 
-        if column_type != field.spark_type:
-            if _type_casting_allowed(field.spark_type, column_type):
+        if column_type != spark_type:
+            if _type_casting_allowed(spark_type, column_type):
                 mapped_source_df = mapped_source_df.withColumn(
-                    field.name, col(field.name).cast(field.spark_type)
+                    field.name, col(field.name).cast(spark_type)
                 )
             else:
                 raise SchemaError(
-                    f"{field.name} should be of {field.spark_type} type, but is {column_type} instead"
+                    f"{field.name} should be of {spark_type} type, but is {column_type} instead"
                 )
 
     for timestamp_column in [
@@ -916,15 +931,16 @@ def retrieve_historical_features(
         for feature_table, source in zip(feature_tables, feature_tables_sources)
     ]
 
-    expected_entities = []
+    expected_entities: List[Field] = []
    for feature_table in feature_tables:
        expected_entities.extend(feature_table.entities)
 
     entity_dtypes = dict(entity_df.dtypes)
     for expected_entity in expected_entities:
-        if entity_dtypes.get(expected_entity.name) != expected_entity.spark_type:
+        spark_type = _spark_type(expected_entity, entity_source)
+        if entity_dtypes.get(expected_entity.name) != spark_type:
             raise SchemaError(
-                f"{expected_entity.name} ({expected_entity.spark_type}) is not present in the entity dataframe."
+                f"{expected_entity.name} ({spark_type}) is not present in the entity dataframe."
             )
 
     entity_df.cache()
```

caraml-store-spark/build.gradle

Lines changed: 6 additions & 3 deletions
```diff
@@ -60,11 +60,14 @@ dependencies {
     testImplementation 'com.github.tomakehurst:wiremock-jre8:2.26.3'
     testImplementation "com.dimafeng:testcontainers-scala-kafka_$scalaVersion:0.40.12"
     testRuntimeOnly 'com.vladsch.flexmark:flexmark-all:0.35.10'
-    implementation files('./prebuilt-jars/custom-dialect.jar')
-    compileOnly('com.aliyun.odps:odps-jdbc:3.8.2') {
+    compileOnly('com.aliyun.odps:odps-jdbc:3.10.1') {
         exclude group: 'org.antlr', module: 'antlr4-runtime'
     }
 
+    // Fixes a unit-test error: the tests previously passed because aliyun-odps-jdbc 3.8.2
+    // depends on odps-sdk-core 0.51.5, which in turn pulls in jackson-databind 2.15.2,
+    // so jackson-databind now has to be declared explicitly for the test classpath.
+    testImplementation "com.fasterxml.jackson.core:jackson-databind:2.15.2"
 }
 application {
     mainClassName = 'dev.caraml.spark.IngestionJob'
@@ -87,7 +90,7 @@ def containerRegistry = System.getenv('DOCKER_REGISTRY')
 docker {
     dependsOn shadowJar
     dockerfile file('docker/Dockerfile')
-    files shadowJar.outputs, "$rootDir/caraml-store-pyspark/scripts", "$rootDir/caraml-store-spark/prebuilt-jars/custom-dialect.jar"
+    files shadowJar.outputs, "$rootDir/caraml-store-pyspark/scripts"
     copySpec.with {
         from("$rootDir/caraml-store-pyspark") {
             include 'templates/**'
```

caraml-store-spark/docker/Dockerfile

Lines changed: 3 additions & 3 deletions
```diff
@@ -2,15 +2,16 @@ FROM --platform=linux/amd64 apache/spark-py:v3.1.3
 
 ARG GCS_CONNECTOR_VERSION=2.2.5
 ARG BQ_CONNECTOR_VERSION=0.27.1
-ARG ODPS_JDBC_CONNECTOR=3.8.2
+ARG ODPS_JDBC_CONNECTOR=3.10.1
 ARG HADOOP_ALIYUN_VERSION=3.2.0
 ARG ALIYUN_SDK_OSS_VERSION=2.8.3
 ARG JDOM_VERSION=1.1
 
 USER root
 ADD https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop2-${GCS_CONNECTOR_VERSION}.jar /opt/spark/jars
 ADD https://repo1.maven.org/maven2/com/google/cloud/spark/spark-bigquery-with-dependencies_2.12/${BQ_CONNECTOR_VERSION}/spark-bigquery-with-dependencies_2.12-${BQ_CONNECTOR_VERSION}.jar /opt/spark/jars
-ADD https://github.com/aliyun/aliyun-odps-jdbc/releases/download/v${ODPS_JDBC_CONNECTOR}/odps-jdbc-${ODPS_JDBC_CONNECTOR}-jar-with-dependencies.jar /opt/spark/jars
+# aliyun odps jdbc with dependencies
+ADD https://github.com/aliyun/aliyun-odps-jdbc/releases/download/v${ODPS_JDBC_CONNECTOR}/odps-jdbc-${ODPS_JDBC_CONNECTOR}.jar /opt/spark/jars
 ADD https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aliyun/${HADOOP_ALIYUN_VERSION}/hadoop-aliyun-${HADOOP_ALIYUN_VERSION}.jar /opt/spark/jars
 ADD https://repo1.maven.org/maven2/com/aliyun/oss/aliyun-sdk-oss/${ALIYUN_SDK_OSS_VERSION}/aliyun-sdk-oss-${ALIYUN_SDK_OSS_VERSION}.jar /opt/spark/jars
 ADD https://repo1.maven.org/maven2/org/jdom/jdom/${JDOM_VERSION}/jdom-${JDOM_VERSION}.jar /opt/spark/jars
@@ -20,6 +21,5 @@ RUN pip install Jinja2==3.1.2
 RUN mkdir -p /dev
 
 ADD caraml-spark-application-with-dependencies.jar /opt/spark/jars
-ADD custom-dialect.jar /opt/spark/jars
 ADD templates /opt/spark/work-dir/
 ADD historical_feature_retrieval_job.py /opt/spark/work-dir
```
Binary file changed (−971 bytes); contents not shown.

caraml-store-spark/src/main/scala/dev/caraml/spark/odps/CustomDialect.scala

Lines changed: 75 additions & 4 deletions
```diff
@@ -1,9 +1,7 @@
 package dev.caraml.spark.odps
-import org.apache.spark.sql.jdbc.JdbcDialect
-import org.apache.spark.sql.jdbc.JdbcType
-import org.apache.spark.sql.types.DataType
+import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcType}
 import org.apache.spark.sql.types._
-import org.apache.hbase.thirdparty.org.eclipse.jetty.util.ajax.JSON
+import java.sql.SQLException
 
 class CustomDialect extends JdbcDialect {
   override def canHandle(url: String): Boolean = {
@@ -14,6 +12,79 @@ class CustomDialect extends JdbcDialect {
     s"$colName"
   }
 
+  /*
+    TODO: currently unsupported types
+    - ARRAY<DECIMAL(precision,scale)>
+    - ARRAY<VARCHAR(n)>  --> temporarily mapped to string
+    - ARRAY<CHAR(n)>     --> temporarily mapped to string
+    - ARRAY<DATE>
+    - ARRAY<DATETIME>
+    - ARRAY<TIMESTAMP>
+    - ARRAY<TIMESTAMP_NTZ>
+    - ARRAY<INTERVAL>
+
+    The type names below were obtained from https://www.alibabacloud.com/help/en/maxcompute/user-guide/maxcompute-v2-0-data-type-edition
+  */
+  private def getCommonCatalystType(typeName: String): Option[DataType] = {
+    typeName.toUpperCase() match {
+      case "TINYINT"  => Option(ByteType)
+      case "SMALLINT" => Option(ShortType)
+      case "INT"      => Option(IntegerType)
+      case "BIGINT"   => Option(LongType)
+      case "BINARY"   => Option(BinaryType)
+      case "FLOAT"    => Option(FloatType)
+      case "DOUBLE"   => Option(DoubleType)
+      // case s if s.startsWith("DECIMAL") =>
+      //   val mdat = s.stripPrefix("DECIMAL(").stripSuffix(")").split(",")
+      //   if (mdat.length == 2) {
+      //     val precision = mdat(0).toInt
+      //     val scale = mdat(1).toInt
+      //     Option(DecimalType(min(precision, DecimalType.MAX_PRECISION), min(scale, DecimalType.MAX_SCALE)))
+      //   } else {
+      //     Option(DecimalType.SYSTEM_DEFAULT)
+      //   }
+      // case s if s.startsWith("VARCHAR") => Option(VarcharType(s.stripPrefix("VARCHAR(").stripSuffix(")").toInt))
+      // case s if s.startsWith("CHAR") => Option(CharType(s.stripPrefix("CHAR(").stripSuffix(")").toInt))
+      case s if s.startsWith("VARCHAR") => Option(StringType)
+      case s if s.startsWith("CHAR")    => Option(StringType)
+      case "STRING" => Option(StringType)
+      // case "DATE" => Option(DateType)
+      // case "DATETIME" => Option(TimestampType)
+      // case "TIMESTAMP" => Option(TimestampType)
+      // case "TIMESTAMP_NTZ" => Option(TimestampType)
+      case "BOOLEAN" => Option(BooleanType)
+      // case "INTERVAL" => Option(CalendarIntervalType)
+      case _ => None
+    }
+  }
+
+  override def getCatalystType(
+      sqlType: Int,
+      typeName: String,
+      size: Int,
+      md: MetadataBuilder
+  ): Option[DataType] = {
+    sqlType match {
+      case java.sql.Types.ARRAY =>
+        val elementTypeName = typeName.toUpperCase().stripPrefix("ARRAY<").stripSuffix(">")
+        val elementType     = getCommonCatalystType(elementTypeName).map(ArrayType(_))
+
+        if (elementType.isEmpty) {
+          throw new SQLException(s"Unsupported type $typeName")
+        }
+        logDebug(
+          s"CustomDialect sqlType: $sqlType md: ${md.build().toString()} size: $size typeName: $typeName elementType: ${elementType.getOrElse(ArrayType(NullType)).elementType}"
+        )
+        elementType
+      case _ =>
+        val dataType = getCommonCatalystType(typeName.toUpperCase())
+        logDebug(
+          s"CustomDialect sqlType: $sqlType md: ${md.build().toString()} size: $size typeName: $typeName dataType: $dataType"
+        )
+        dataType
+    }
+  }
+
   override def getJDBCType(dt: DataType): Option[JdbcType] = {
     dt match {
       case IntegerType => Option(JdbcType("INTEGER", java.sql.Types.INTEGER))
```
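
As a quick illustration of what the new override does, here is a minimal sketch that probes the dialect directly. The object name and assertions are illustrative only and not part of the commit; it assumes only the `CustomDialect` class and the Spark SQL types shown in the diff above.

```scala
import java.sql.Types

import org.apache.spark.sql.types.{ArrayType, LongType, MetadataBuilder, StringType}

import dev.caraml.spark.odps.CustomDialect

// Minimal sketch (not from the commit): exercises both branches of
// CustomDialect.getCatalystType introduced in this diff.
object CustomDialectSketch {
  def main(args: Array[String]): Unit = {
    val dialect = new CustomDialect()

    // A MaxCompute ARRAY<STRING> column should surface in Spark as array<string>.
    val arrayType =
      dialect.getCatalystType(Types.ARRAY, "ARRAY<STRING>", 0, new MetadataBuilder())
    assert(arrayType.contains(ArrayType(StringType)))

    // Scalar type names fall through to getCommonCatalystType, e.g. BIGINT -> LongType.
    val scalarType =
      dialect.getCatalystType(Types.BIGINT, "BIGINT", 0, new MetadataBuilder())
    assert(scalarType.contains(LongType))

    // Unsupported array element types (e.g. ARRAY<DATE>) throw SQLException,
    // per the TODO list in the dialect.
  }
}
```

Note that unsupported array element types fail fast with an `SQLException` instead of silently returning `None`, so schema gaps surface at read time rather than as downstream casting errors.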

caraml-store-spark/src/main/scala/dev/caraml/spark/sources/maxCompute/MaxComputeReader.scala

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,10 +1,10 @@
 package dev.caraml.spark.sources.maxCompute
 
-import dev.caraml.spark.{MaxComputeSource, MaxComputeConfig}
+import dev.caraml.spark.{MaxComputeConfig, MaxComputeSource}
 import org.joda.time.DateTime
 import org.apache.spark.sql.{DataFrame, SparkSession}
 import org.apache.spark.sql.jdbc.JdbcDialects
-import com.caraml.odps.CustomDialect
+import dev.caraml.spark.odps.CustomDialect
 import org.apache.log4j.Logger
 
 object MaxComputeReader {
```
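
The import change above points the reader at the in-repo `dev.caraml.spark.odps.CustomDialect` instead of the `com.caraml.odps` package from the removed prebuilt jar. Since the reader body is not shown in this diff, the following is a hedged sketch of how such a dialect is typically registered ahead of a JDBC read; the JDBC URL format, driver class, option names, and environment variables are assumptions for illustration, not taken from `MaxComputeReader`.

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.jdbc.JdbcDialects

import dev.caraml.spark.odps.CustomDialect

// Illustrative sketch only: endpoint, project, table, and credential handling
// are placeholders, not the actual MaxComputeReader implementation.
object MaxComputeReadSketch {
  def readTable(spark: SparkSession, project: String, table: String): DataFrame = {
    // Register the custom dialect so ARRAY<...> columns map to Spark ArrayType
    // instead of failing as unsupported JDBC types.
    JdbcDialects.registerDialect(new CustomDialect())

    spark.read
      .format("jdbc")
      .option("url", s"jdbc:odps:https://service.example-region.maxcompute.aliyun.com/api?project=$project")
      .option("driver", "com.aliyun.odps.jdbc.OdpsDriver")
      .option("dbtable", table)
      .option("user", sys.env.getOrElse("ODPS_ACCESS_ID", ""))
      .option("password", sys.env.getOrElse("ODPS_ACCESS_KEY", ""))
      .load()
  }
}
```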

0 commit comments