opensource4you · wycccccc · Jan 31, 2023 · Jan 31, 2023 · Feb 19, 2023 · Feb 22, 2023
diff --git a/config/spark2kafka.properties b/config/spark2kafka.properties
@@ -1,9 +1,9 @@
 #Parameters you must configure
 #==============================================================
-#The data source path should be a directory.
+#The data source should be a directory.
 source.path =
 
-#The CSV Column Name.For example:sA=string,sB=integer,sC=boolean...
+#The CSV column names. For example: sA=string,sB=integer,sC=boolean...
 column.names =
 
 #Primary keys.For example:sA=string,sB=integer,sC=boolean...
@@ -15,8 +15,8 @@ kafka.bootstrap.servers =
 #Set your topic name.
 topic.name =
 
-#Spark checkpoint path
-checkpoint =
+#Spark checkpoint path
+checkpoint.path =
 
 #Parameters that can be selected for configuration
 #==============================================================

diff --git a/docker/start_etl.sh b/docker/start_etl.sh
@@ -24,7 +24,7 @@ declare -r SPARK_VERSION=${SPARK_VERSION:-3.3.1}
 declare -r LOCAL_PATH=$(cd -- "$(dirname -- "${DOCKER_FOLDER}")" &>/dev/null && pwd)
 # ===============================[properties keys]=================================
 declare -r SOURCE_KEY="source.path"
-declare -r CHECKPOINT_KEY="checkpoint"
+declare -r CHECKPOINT_KEY="checkpoint.path"
 # ===============================[spark driver/executor resource]==================
 declare -r RESOURCES_CONFIGS="${RESOURCES_CONFIGS:-"--conf spark.driver.memory=4g --conf spark.executor.memory=4g"}"
 # ===================================[functions]===================================
@@ -89,7 +89,7 @@ function runContainer() {
 
   if [[ "$master" == "spark:"* ]] || [[ "$master" == "local"* ]]; then
     docker run -d --init \
-      --name "csv-kafka-${source_name}" \
+      --name "csv-kafka${source_name}" \
       $network_config \
       -v "$propertiesPath":"$propertiesPath":ro \
       -v "$jar_path":/tmp/astraea-etl.jar:ro \

diff --git a/etl/src/main/scala/org/astraea/etl/DataFrameProcessor.scala b/etl/src/main/scala/org/astraea/etl/DataFrameProcessor.scala
@@ -70,7 +70,14 @@ class DataFrameProcessor(dataFrame: DataFrame) {
         .withColumn(
           "value",
           defaultConverter(
-            map(cols.flatMap(c => List(lit(c.name), col(c.name))): _*)
+            map(
+              cols.flatMap(c =>
+                List(
+                  lit(c.name),
+                  when(col(c.name).isNotNull, col(c.name)).otherwise(lit(null))
+                )
+              ): _*
+            )
           )
         )
         .withColumn(
@@ -171,10 +178,6 @@ object DataFrameProcessor {
 
     private def schema(columns: Seq[DataColumn]): StructType =
       StructType(columns.map { col =>
-        if (col.dataType != DataType.StringType)
-          throw new IllegalArgumentException(
-            "Sorry, only string type is currently supported.Because a problem(astraea #1286) has led to the need to wrap the non-nullable type."
-          )
         StructField(col.name, col.dataType.sparkType)
       })
   }

diff --git a/etl/src/main/scala/org/astraea/etl/Metadata.scala b/etl/src/main/scala/org/astraea/etl/Metadata.scala
@@ -71,7 +71,7 @@ object Metadata {
 
   private[etl] val DEFAULT_PARTITIONS = 15
   private[etl] val DEFAULT_REPLICAS = 1.toShort
-  private[etl] val DEFAULT_RECURSIVE = "ture"
+  private[etl] val DEFAULT_RECURSIVE = "true"
   private[etl] val DEFAULT_CLEAN_SOURCE = "delete"
 
   // Parameters needed to configure ETL.