homeaway · srinugajjala · May 12, 2025 · Mar 7, 2024 · Mar 7, 2024 · Apr 24, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html)
 
+
+## [0.1.85] - 2025-04-10
+Addressing NULL and invalid subnet issue
+### Changed
+-api/src/main/java/com/homeaway/datapullclient/process/DataPullTask.java
+
+## [0.1.84] - 2025-02-19
+Taking hive input config from user to override default values
+### Changed
+-core/src/main/scala/core/Migration.scala
+-core/src/main/scala/core/DataFrameFromTo.scala
+
 ## [0.1.83] - 2025-01-24
 fix java.util.NoSuchElementException: key not found: url
 ### Changed

diff --git a/api/src/main/java/com/homeaway/datapullclient/process/DataPullTask.java b/api/src/main/java/com/homeaway/datapullclient/process/DataPullTask.java
@@ -427,12 +427,31 @@ private JobFlowInstancesConfig getJobFlowInstancesConfig(EMRProperties emrProper
             subnets.add(0,clusterProperties.getSubnetId());
         }
 
-        Set<String> subnets_deduped = new LinkedHashSet<>(subnets);
-        subnets.clear();
-        subnets.addAll(subnets_deduped);
+//      Introducing below logic to address null and invalid subnet issue
+        String getSubnetId = clusterProperties.getSubnetId();
+        String finalSubnetId;
+
+
+        if (StringUtils.isNotBlank(getSubnetId) && getSubnetId.startsWith("subnet-")) {
+            finalSubnetId = getSubnetId;
+            System.out.println("Subnet '" + finalSubnetId + "' provided by the user will be used for EMR cluster creation.");
+        } else {
+            if (StringUtils.isNotBlank(getSubnetId)) {
+                System.out.println("The user provided an invalid value '" + getSubnetId + "' for subnet. Hence, default subnet pool will be used for EMR creation.");
+            } else {
+                System.out.println("The user either provided a NULL value for the subnet or did not specify subnet in the payload. Hence, the default subnet pool will be used for EMR creation.");
+            }
+
+            Set<String> subnetsDeduped = new LinkedHashSet<>(subnets);
+            subnets.clear();
+            subnets.addAll(subnetsDeduped);
+
+            finalSubnetId = subnets.get(0);
+            System.out.println("EMR cluster will be created using a subnet from the default subnet pool: " + finalSubnetId);
+        }
 
         final JobFlowInstancesConfig jobConfig = new JobFlowInstancesConfig()
-                .withEc2SubnetIds(subnets.get(0)) 
+                .withEc2SubnetIds(finalSubnetId)
                 .withInstanceFleets(masterInstanceFleetConfig)
                 .withKeepJobFlowAliveWhenNoSteps(!Boolean.valueOf(Objects.toString
                         (this.clusterProperties.getTerminateClusterAfterExecution(), "true")));
@@ -642,4 +661,5 @@ private ListClustersResult retryListClusters(final AmazonElasticMapReduce emr, f
 
         return listClustersResult;
     }
-}
+
+}
diff --git a/core/pom.xml b/core/pom.xml
@@ -375,4 +375,4 @@
         </dependency>
 
     </dependencies>
-</project>
+</project>
diff --git a/core/src/main/scala/core/DataFrameFromTo.scala b/core/src/main/scala/core/DataFrameFromTo.scala
@@ -1213,7 +1213,38 @@ class DataFrameFromTo(appConfig: AppConfig, pipeline: String) extends Serializab
     df_temp.write.mode(savemode).options(jdbcOptions).jdbc(db_url, table, connectionProperties)
   }
 
-  def hiveToDataFrame(sparkSession: org.apache.spark.sql.SparkSession, query: String): org.apache.spark.sql.DataFrame = {
+  /**
+   * Applies Spark SQL configuration entries to the session and then runs `query`.
+   * Three built-in spark.sql.hive.* defaults are used; entries in `properties` override them.
+   * @param sparkSession session that receives the settings and executes the query
+   * @param query        SQL text handed to sparkSession.sql
+   * @param properties   optional JSON object mapping setting names to values
+   * @return the DataFrame produced by sparkSession.sql(query)
+   */
+  def hiveToDataFrame(sparkSession: SparkSession, query: String, properties: Option[JSONObject] = None): DataFrame = {
+    println("Properties Passed:" + properties)
+    val baselineSettings = Map(
+      "spark.sql.hive.caseSensitiveInferenceMode" -> "INFER_ONLY",
+      "spark.sql.hive.metastore.version" -> "1.2.1",
+      "spark.sql.hive.metastore.jars" -> "builtin"
+    )
+    // Copy every key of the JSON object into an immutable map (empty when nothing was passed).
+    val callerSettings = properties.fold(Map.empty[String, String]) { json =>
+      val collected = Map.newBuilder[String, String]
+      val names = json.keys()
+      while (names.hasNext) {
+        val name = names.next().toString
+        collected += (name -> json.getString(name))
+      }
+      collected.result()
+    }
+
+    val effectiveSettings = baselineSettings ++ callerSettings
+    effectiveSettings.foreach { case (name, value) => sparkSession.sqlContext.setConf(name, value) }
+
+    // Read each value back from the session so the log reflects what the session reports.
+    println("Configurations applied:")
+    effectiveSettings.keys.foreach(name => println(s"$name -> ${sparkSession.sqlContext.getConf(name)}"))
     sparkSession.sql(query)
   }
 

diff --git a/core/src/main/scala/core/Migration.scala b/core/src/main/scala/core/Migration.scala
@@ -615,7 +615,13 @@ class Migration extends SparkListener {
       )
     }
     else if (platform == "hive") {
-      dataframeFromTo.hiveToDataFrame(sparkSession, propertiesMap("query"))
+      val properties = if (platformObject.has("properties")) {
+        Option(platformObject.getJSONObject("properties"))
+      } else {
+        None
+      }
+
+      dataframeFromTo.hiveToDataFrame(sparkSession = sparkSession, query = propertiesMap("query"), properties = properties)
     } else if (platform == "mongodb") {
       dataframeFromTo.mongodbToDataFrame(propertiesMap("awsenv"), propertiesMap("cluster"), propertiesMap.getOrElse("overrideconnector", "false"), propertiesMap("database"), propertiesMap("authenticationdatabase"), propertiesMap("collection"), propertiesMap("login"), propertiesMap("password"), sparkSession, propertiesMap("vaultenv"), platformObject.optJSONObject("sparkoptions"), propertiesMap.getOrElse("secretstore", secretStoreDefaultValue), propertiesMap.getOrElse("authenticationenabled", "true"), propertiesMap.getOrElse("tmpfilelocation", null), propertiesMap.getOrElse("samplesize", null), propertiesMap.getOrElse("sslenabled", "false"))
     }