From 2aa9e735399290c908fb36bba8cb54ad8aec66c0 Mon Sep 17 00:00:00 2001 From: Robert <78926291+Asciax@users.noreply.github.com> Date: Wed, 25 Feb 2026 17:12:59 -0500 Subject: [PATCH 1/6] feat: UNIC-1977 Implement write and insert for ExcelLoader --- build.sbt | 2 +- .../datalake/spark3/loader/ExcelLoader.scala | 26 +++++++++++-- .../datalake/spark3/loader/LoadResolver.scala | 6 +++ .../spark3/loader/ExcelLoaderSpec.scala | 38 ++++++++++++++++++- 4 files changed, 67 insertions(+), 5 deletions(-) diff --git a/build.sbt b/build.sbt index 7a3324ae..7d65b364 100644 --- a/build.sbt +++ b/build.sbt @@ -63,7 +63,7 @@ lazy val `datalake-spark3` = (project in file("datalake-spark3")) "com.microsoft.sqlserver" % "mssql-jdbc" % "8.4.1.jre11" % Provided, "com.microsoft.aad" % "adal4j" % "0.0.2" % Provided, "com.microsoft.azure" % "spark-mssql-connector_2.12" % "1.2.0" % Provided, - "com.crealytics" %% "spark-excel" % "3.5.0_0.20.3" % Provided, + "dev.mauch" %% "spark-excel" % "3.5.5_0.30.2", // % Provided, //Use by ElasticsearchClient "com.softwaremill.sttp.client3" %% "core" % "3.9.2", "com.softwaremill.sttp.client3" %% "json4s" % "3.9.2" exclude("org.json4s", "json4s-core_2.12"), //Exclusion because json4s is used in spark diff --git a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala index bd703149..6c63a64d 100644 --- a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala +++ b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala @@ -1,6 +1,6 @@ package bio.ferlab.datalake.spark3.loader -import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} import java.time.LocalDate @@ -20,6 +20,22 @@ object ExcelLoader extends Loader { .load(location) } + def write(df: DataFrame, + location: String, + databaseName: String, + tableName: String, + partitioning: List[String], + format: String, + options: Map[String, String], + mode: SaveMode): DataFrame = { + df.write + .options(options) + .format(format) + .mode(mode) + .save(location) + df + } + override def overwritePartition(location: String, databaseName: String, tableName: String, @@ -34,7 +50,9 @@ object ExcelLoader extends Loader { df: DataFrame, partitioning: List[String], format: String, - options: Map[String, String])(implicit spark: SparkSession): DataFrame = ??? + options: Map[String, String])(implicit spark: SparkSession): DataFrame = { + write(df, location, databaseName, tableName, partitioning, format, options, SaveMode.Overwrite) + } override def insert(location: String, databaseName: String, @@ -42,7 +60,9 @@ object ExcelLoader extends Loader { updates: DataFrame, partitioning: List[String], format: String, - options: Map[String, String])(implicit spark: SparkSession): DataFrame = ??? + options: Map[String, String])(implicit spark: SparkSession): DataFrame = { + write(updates, location, databaseName, tableName, partitioning, format, options, SaveMode.Append) + } override def upsert(location: String, databaseName: String, diff --git a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/LoadResolver.scala b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/LoadResolver.scala index 9096db5d..62a5c977 100644 --- a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/LoadResolver.scala +++ b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/LoadResolver.scala @@ -50,6 +50,12 @@ object LoadResolver { case (ELASTICSEARCH, OverWrite) => (ds: DatasetConf, df: DataFrame) => ElasticsearchLoader.writeOnce(ds.location, ds.table.map(_.database).getOrElse(""), ds.table.map(_.name).getOrElse(ds.location), df, ds.partitionby, ds.format.sparkFormat, ds.writeoptions) + case (EXCEL, OverWrite) => (ds: DatasetConf, df: DataFrame) => + ExcelLoader.writeOnce(ds.location, ds.table.map(_.database).getOrElse(""), ds.table.map(_.name).getOrElse(ds.location), df, ds.partitionby, ds.format.sparkFormat, ds.writeoptions) + + case (EXCEL, Insert) => (ds: DatasetConf, df: DataFrame) => + ExcelLoader.insert(ds.location, ds.table.map(_.database).getOrElse(""), ds.table.map(_.name).getOrElse(ds.location), df, ds.partitionby, ds.format.sparkFormat, ds.writeoptions) + //generic fallback behaviours case (f, OverWrite) => (ds: DatasetConf, df: DataFrame) => diff --git a/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala b/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala index 886a0740..a29f495e 100644 --- a/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala +++ b/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala @@ -3,16 +3,29 @@ package bio.ferlab.datalake.spark3.loader import bio.ferlab.datalake.commons.config.Format.EXCEL import bio.ferlab.datalake.spark3.testutils.AirportInput import bio.ferlab.datalake.testutils.SparkSpec +import org.scalatest.BeforeAndAfterAll -class ExcelLoaderSpec extends SparkSpec { +import java.nio.file.{Files, Paths} + +class ExcelLoaderSpec extends SparkSpec with BeforeAndAfterAll{ import spark.implicits._ val folderPath: String = getClass.getClassLoader.getResource("raw/landing/").getPath + val outputLocation: String = folderPath + "output/airports.xlsx" val expected: Seq[AirportInput] = Seq( AirportInput("1", "YYC", "Calgary Int airport"), AirportInput("2", "YUL", "Montreal Int airport") ) + val initialDF = expected.toDF() + + override def beforeAll(): Unit = { + super.beforeAll() + val outputPath = Paths.get(outputLocation) + if (Files.exists(outputPath)) { + Files.delete(outputPath) + } + } "read" should "read xlsx file as a DataFrame" in { val fileLocation = folderPath + "airports.xlsx" @@ -34,5 +47,28 @@ class ExcelLoaderSpec extends SparkSpec { .collect() should contain theSameElementsAs expected } + "writeOnce" should "write a dataframe to a file" in { + ExcelLoader.writeOnce(outputLocation, "", "", initialDF, Nil, EXCEL.sparkFormat, Map("header" -> "true")) + + val result = ExcelLoader.read(outputLocation, EXCEL.sparkFormat, Map("header" -> "true")) + + result.as[AirportInput].collect() should contain theSameElementsAs expected + } + + "insert" should "append a dataframe to an existing file" in { + // Overwrite with initial data first + ExcelLoader.writeOnce(outputLocation, "", "", initialDF, Nil, EXCEL.sparkFormat, Map("header" -> "true")) + + // Prepare new data and append it + val updates = Seq(AirportInput("3", "YVR", "Vancouver Int airport")).toDF() + val expectedDfValues = expected ++ Seq(AirportInput("3", "YVR", "Vancouver Int airport")) + + ExcelLoader.insert(outputLocation, "", "", updates, Nil, EXCEL.sparkFormat, Map("header" -> "true")) + + val result = ExcelLoader.read(outputLocation, EXCEL.sparkFormat, Map("header" -> "true")) + + result.as[AirportInput].collect() should contain theSameElementsAs expectedDfValues + } + } From a0af8624677c870303967f6427593773778b7d07 Mon Sep 17 00:00:00 2001 From: Robert <78926291+Asciax@users.noreply.github.com> Date: Thu, 26 Feb 2026 15:36:48 -0500 Subject: [PATCH 2/6] feat: Changed lib to provided, added Excel tests, required schema on write --- build.sbt | 2 +- .../datalake/spark3/loader/ExcelLoader.scala | 5 +- .../spark3/loader/ExcelLoaderSpec.scala | 71 ++++++++++++++++--- 3 files changed, 68 insertions(+), 10 deletions(-) diff --git a/build.sbt b/build.sbt index 7d65b364..6926c40d 100644 --- a/build.sbt +++ b/build.sbt @@ -63,7 +63,7 @@ lazy val `datalake-spark3` = (project in file("datalake-spark3")) "com.microsoft.sqlserver" % "mssql-jdbc" % "8.4.1.jre11" % Provided, "com.microsoft.aad" % "adal4j" % "0.0.2" % Provided, "com.microsoft.azure" % "spark-mssql-connector_2.12" % "1.2.0" % Provided, - "dev.mauch" %% "spark-excel" % "3.5.5_0.30.2", // % Provided, + "dev.mauch" %% "spark-excel" % "3.5.5_0.30.2" % Provided, //Use by ElasticsearchClient "com.softwaremill.sttp.client3" %% "core" % "3.9.2", "com.softwaremill.sttp.client3" %% "json4s" % "3.9.2" exclude("org.json4s", "json4s-core_2.12"), //Exclusion because json4s is used in spark diff --git a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala index 6c63a64d..907910a2 100644 --- a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala +++ b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala @@ -28,6 +28,9 @@ object ExcelLoader extends Loader { format: String, options: Map[String, String], mode: SaveMode): DataFrame = { + // Excel format requires the schema to be non-empty, does not support empty schema dataframe writes + require(df.schema.nonEmpty, "DataFrame must have a valid schema with at least one column.") + df.write .options(options) .format(format) @@ -72,7 +75,7 @@ object ExcelLoader extends Loader { partitioning: List[String], format: String, options: Map[String, String])(implicit spark: SparkSession): DataFrame = ??? - + override def scd1(location: String, databaseName: String, tableName: String, diff --git a/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala b/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala index a29f495e..3e7774d3 100644 --- a/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala +++ b/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala @@ -3,11 +3,12 @@ package bio.ferlab.datalake.spark3.loader import bio.ferlab.datalake.commons.config.Format.EXCEL import bio.ferlab.datalake.spark3.testutils.AirportInput import bio.ferlab.datalake.testutils.SparkSpec -import org.scalatest.BeforeAndAfterAll +import org.apache.spark.sql.DataFrame +import org.scalatest.BeforeAndAfterEach import java.nio.file.{Files, Paths} -class ExcelLoaderSpec extends SparkSpec with BeforeAndAfterAll{ +class ExcelLoaderSpec extends SparkSpec with BeforeAndAfterEach { import spark.implicits._ @@ -17,14 +18,39 @@ class ExcelLoaderSpec extends SparkSpec with BeforeAndAfterAll{ AirportInput("1", "YYC", "Calgary Int airport"), AirportInput("2", "YUL", "Montreal Int airport") ) - val initialDF = expected.toDF() + val simpleExpectedUpdate: Seq[AirportInput] = Seq( + AirportInput("3", "YVR", "Vancouver Int airport") + ) + + val initialDF: DataFrame = expected.toDF() - override def beforeAll(): Unit = { - super.beforeAll() + override def afterEach(): Unit = { + super.afterEach() val outputPath = Paths.get(outputLocation) if (Files.exists(outputPath)) { - Files.delete(outputPath) + cleanUpFilesRecursively(outputPath) + } + } + + /** + * Recursively deletes files and directories at the given path. Necessary because ExcelLoader API v2 + * may create multiple excel partitions when writing to a folder. + * */ + private def cleanUpFilesRecursively(path: java.nio.file.Path): Unit = { + if (Files.isDirectory(path)) { + Files.list(path).forEach(cleanUpFilesRecursively) } + Files.deleteIfExists(path) + } + + private def withInitialDfInFolder(testCode: => Any): Unit = { + ExcelLoader.writeOnce(outputLocation, "", "", initialDF, Nil, EXCEL.sparkFormat, Map("header" -> "true")) + testCode + } + + private def withUpdatedDfInFolder(updates: DataFrame, testCode: String => Any): Unit = { + ExcelLoader.insert(outputLocation, "", "", updates, Nil, EXCEL.sparkFormat, Map("header" -> "true")) + testCode(outputLocation) } "read" should "read xlsx file as a DataFrame" in { @@ -47,6 +73,25 @@ class ExcelLoaderSpec extends SparkSpec with BeforeAndAfterAll{ .collect() should contain theSameElementsAs expected } + it should "throw an exception when the header option is missing" in { + val fileLocation: String = folderPath + "airports.xlsx" + + an[IllegalArgumentException] should be thrownBy { + ExcelLoader.read(fileLocation, EXCEL.sparkFormat, Map.empty, None, None) + } + } + + it should "read folder containing multiple Excel files as a DataFrame" in withInitialDfInFolder { + withUpdatedDfInFolder(simpleExpectedUpdate.toDF(), { folderLocation => + + val result = ExcelLoader.read(folderLocation, EXCEL.sparkFormat, Map("header" -> "true"), None, None) + + result + .as[AirportInput] + .collect() should contain theSameElementsAs (expected ++ simpleExpectedUpdate) + }) + } + "writeOnce" should "write a dataframe to a file" in { ExcelLoader.writeOnce(outputLocation, "", "", initialDF, Nil, EXCEL.sparkFormat, Map("header" -> "true")) @@ -55,7 +100,18 @@ class ExcelLoaderSpec extends SparkSpec with BeforeAndAfterAll{ result.as[AirportInput].collect() should contain theSameElementsAs expected } - "insert" should "append a dataframe to an existing file" in { + "insert" should "append a dataframe to an existing file" in withInitialDfInFolder { + withUpdatedDfInFolder(simpleExpectedUpdate.toDF(), { + folderLocation => + val result = ExcelLoader.read(folderLocation, EXCEL.sparkFormat, Map("header" -> "true"), None, None) + + result + .as[AirportInput] + .collect() should contain theSameElementsAs (expected ++ simpleExpectedUpdate) + }) + } + + "insert2" should "append a dataframe to an existing file" in { // Overwrite with initial data first ExcelLoader.writeOnce(outputLocation, "", "", initialDF, Nil, EXCEL.sparkFormat, Map("header" -> "true")) @@ -70,5 +126,4 @@ class ExcelLoaderSpec extends SparkSpec with BeforeAndAfterAll{ result.as[AirportInput].collect() should contain theSameElementsAs expectedDfValues } - } From 33dfdc2fce5190cd4ab9f87092c4a6238608bb11 Mon Sep 17 00:00:00 2001 From: Robert <78926291+Asciax@users.noreply.github.com> Date: Thu, 26 Feb 2026 15:55:21 -0500 Subject: [PATCH 3/6] refactor: Correct docstrings, improve test spec, require header since required by spark-excel --- .../datalake/spark3/loader/ExcelLoader.scala | 1 + .../spark3/loader/ExcelLoaderSpec.scala | 28 ++++++++----------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala index 907910a2..e05a0bd6 100644 --- a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala +++ b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala @@ -30,6 +30,7 @@ object ExcelLoader extends Loader { mode: SaveMode): DataFrame = { // Excel format requires the schema to be non-empty, does not support empty schema dataframe writes require(df.schema.nonEmpty, "DataFrame must have a valid schema with at least one column.") + require(options.isDefinedAt("header"), "Expecting [header] to be defined in readOptions.") df.write .options(options) diff --git a/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala b/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala index 3e7774d3..03daf496 100644 --- a/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala +++ b/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala @@ -33,7 +33,7 @@ class ExcelLoaderSpec extends SparkSpec with BeforeAndAfterEach { } /** - * Recursively deletes files and directories at the given path. Necessary because ExcelLoader API v2 + * Recursively deletes files and directories at the given path. Necessary because spark-excel format API v2 * may create multiple excel partitions when writing to a folder. * */ private def cleanUpFilesRecursively(path: java.nio.file.Path): Unit = { @@ -100,6 +100,17 @@ class ExcelLoaderSpec extends SparkSpec with BeforeAndAfterEach { result.as[AirportInput].collect() should contain theSameElementsAs expected } + it should "overwrite existing files when writing to the same folder" in withInitialDfInFolder { + //Overwriting the same location + ExcelLoader.writeOnce(outputLocation, "", "", simpleExpectedUpdate.toDF(), Nil, EXCEL.sparkFormat, Map("header" -> "true")) + + val result = ExcelLoader.read(outputLocation, EXCEL.sparkFormat, Map("header" -> "true"), None, None) + + result + .as[AirportInput] + .collect() should contain theSameElementsAs simpleExpectedUpdate + } + "insert" should "append a dataframe to an existing file" in withInitialDfInFolder { withUpdatedDfInFolder(simpleExpectedUpdate.toDF(), { folderLocation => @@ -111,19 +122,4 @@ class ExcelLoaderSpec extends SparkSpec with BeforeAndAfterEach { }) } - "insert2" should "append a dataframe to an existing file" in { - // Overwrite with initial data first - ExcelLoader.writeOnce(outputLocation, "", "", initialDF, Nil, EXCEL.sparkFormat, Map("header" -> "true")) - - // Prepare new data and append it - val updates = Seq(AirportInput("3", "YVR", "Vancouver Int airport")).toDF() - val expectedDfValues = expected ++ Seq(AirportInput("3", "YVR", "Vancouver Int airport")) - - ExcelLoader.insert(outputLocation, "", "", updates, Nil, EXCEL.sparkFormat, Map("header" -> "true")) - - val result = ExcelLoader.read(outputLocation, EXCEL.sparkFormat, Map("header" -> "true")) - - result.as[AirportInput].collect() should contain theSameElementsAs expectedDfValues - } - } From 67bafa98adcec465c388e183d35448403d09e377 Mon Sep 17 00:00:00 2001 From: Robert <78926291+Asciax@users.noreply.github.com> Date: Wed, 4 Mar 2026 15:49:07 -0500 Subject: [PATCH 4/6] refactor: Use , implement suggested changes --- .../datalake/spark3/loader/ExcelLoader.scala | 9 +- .../spark3/loader/ExcelLoaderSpec.scala | 95 +++++++++---------- 2 files changed, 47 insertions(+), 57 deletions(-) diff --git a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala index e05a0bd6..7637bd1f 100644 --- a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala +++ b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala @@ -20,11 +20,8 @@ object ExcelLoader extends Loader { .load(location) } - def write(df: DataFrame, + private def write(df: DataFrame, location: String, - databaseName: String, - tableName: String, - partitioning: List[String], format: String, options: Map[String, String], mode: SaveMode): DataFrame = { @@ -55,7 +52,7 @@ object ExcelLoader extends Loader { partitioning: List[String], format: String, options: Map[String, String])(implicit spark: SparkSession): DataFrame = { - write(df, location, databaseName, tableName, partitioning, format, options, SaveMode.Overwrite) + write(df, location, format, options, SaveMode.Overwrite) } override def insert(location: String, @@ -65,7 +62,7 @@ object ExcelLoader extends Loader { partitioning: List[String], format: String, options: Map[String, String])(implicit spark: SparkSession): DataFrame = { - write(updates, location, databaseName, tableName, partitioning, format, options, SaveMode.Append) + write(updates, location, format, options, SaveMode.Append) } override def upsert(location: String, diff --git a/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala b/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala index 03daf496..4ceffbaf 100644 --- a/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala +++ b/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala @@ -8,51 +8,33 @@ import org.scalatest.BeforeAndAfterEach import java.nio.file.{Files, Paths} -class ExcelLoaderSpec extends SparkSpec with BeforeAndAfterEach { +class ExcelLoaderSpec extends SparkSpec { import spark.implicits._ val folderPath: String = getClass.getClassLoader.getResource("raw/landing/").getPath - val outputLocation: String = folderPath + "output/airports.xlsx" + val outputLocation: String = "output/airports.xlsx" val expected: Seq[AirportInput] = Seq( AirportInput("1", "YYC", "Calgary Int airport"), AirportInput("2", "YUL", "Montreal Int airport") ) - val simpleExpectedUpdate: Seq[AirportInput] = Seq( + val expectedUpdate: Seq[AirportInput] = Seq( AirportInput("3", "YVR", "Vancouver Int airport") ) val initialDF: DataFrame = expected.toDF() - override def afterEach(): Unit = { - super.afterEach() - val outputPath = Paths.get(outputLocation) - if (Files.exists(outputPath)) { - cleanUpFilesRecursively(outputPath) - } - } - - /** - * Recursively deletes files and directories at the given path. Necessary because spark-excel format API v2 - * may create multiple excel partitions when writing to a folder. - * */ - private def cleanUpFilesRecursively(path: java.nio.file.Path): Unit = { - if (Files.isDirectory(path)) { - Files.list(path).forEach(cleanUpFilesRecursively) - } - Files.deleteIfExists(path) + private def withInitialDfInFolder(rootPath: String)(testCode: String => Any): Unit = { + val dfPath: String = rootPath + outputLocation + ExcelLoader.writeOnce(dfPath, "", "", initialDF, Nil, EXCEL.sparkFormat, Map("header" -> "true")) + testCode(dfPath) } - private def withInitialDfInFolder(testCode: => Any): Unit = { - ExcelLoader.writeOnce(outputLocation, "", "", initialDF, Nil, EXCEL.sparkFormat, Map("header" -> "true")) + private def withUpdatedDfInFolder(updates: DataFrame, path: String)(testCode: => Any): Unit = { + ExcelLoader.insert(path, "", "", updates, Nil, EXCEL.sparkFormat, Map("header" -> "true")) testCode } - private def withUpdatedDfInFolder(updates: DataFrame, testCode: String => Any): Unit = { - ExcelLoader.insert(outputLocation, "", "", updates, Nil, EXCEL.sparkFormat, Map("header" -> "true")) - testCode(outputLocation) - } - "read" should "read xlsx file as a DataFrame" in { val fileLocation = folderPath + "airports.xlsx" @@ -76,50 +58,61 @@ class ExcelLoaderSpec extends SparkSpec with BeforeAndAfterEach { it should "throw an exception when the header option is missing" in { val fileLocation: String = folderPath + "airports.xlsx" + val options = Map.empty[String, String] + val databaseName, tableName : Option[String] = None + an[IllegalArgumentException] should be thrownBy { - ExcelLoader.read(fileLocation, EXCEL.sparkFormat, Map.empty, None, None) + ExcelLoader.read(fileLocation, EXCEL.sparkFormat, options, databaseName, tableName) } } - it should "read folder containing multiple Excel files as a DataFrame" in withInitialDfInFolder { - withUpdatedDfInFolder(simpleExpectedUpdate.toDF(), { folderLocation => + it should "read folder containing multiple Excel files as a DataFrame" in withOutputFolder("root") { root => + withInitialDfInFolder(root) { folderLocation => + withUpdatedDfInFolder(expectedUpdate.toDF(), folderLocation) { - val result = ExcelLoader.read(folderLocation, EXCEL.sparkFormat, Map("header" -> "true"), None, None) + val result = ExcelLoader.read(folderLocation, EXCEL.sparkFormat, Map("header" -> "true"), None, None) - result - .as[AirportInput] - .collect() should contain theSameElementsAs (expected ++ simpleExpectedUpdate) - }) + result + .as[AirportInput] + .collect() should contain theSameElementsAs (expected ++ expectedUpdate) + } + } } - "writeOnce" should "write a dataframe to a file" in { - ExcelLoader.writeOnce(outputLocation, "", "", initialDF, Nil, EXCEL.sparkFormat, Map("header" -> "true")) + "writeOnce" should "write a dataframe to a file" in withOutputFolder("root") { root => + withInitialDfInFolder(root) { folderLocation => + + val result = ExcelLoader.read(folderLocation, EXCEL.sparkFormat, Map("header" -> "true")) - val result = ExcelLoader.read(outputLocation, EXCEL.sparkFormat, Map("header" -> "true")) + result.as[AirportInput].collect() should contain theSameElementsAs expected - result.as[AirportInput].collect() should contain theSameElementsAs expected + } } - it should "overwrite existing files when writing to the same folder" in withInitialDfInFolder { - //Overwriting the same location - ExcelLoader.writeOnce(outputLocation, "", "", simpleExpectedUpdate.toDF(), Nil, EXCEL.sparkFormat, Map("header" -> "true")) + it should "overwrite existing files when writing to the same folder" in withOutputFolder("root") { root => + withInitialDfInFolder(root) { folderLocation => - val result = ExcelLoader.read(outputLocation, EXCEL.sparkFormat, Map("header" -> "true"), None, None) + //Overwriting the same location + ExcelLoader.writeOnce(outputLocation, "", "", expectedUpdate.toDF(), Nil, EXCEL.sparkFormat, Map("header" -> "true")) - result - .as[AirportInput] - .collect() should contain theSameElementsAs simpleExpectedUpdate + val result = ExcelLoader.read(folderLocation, EXCEL.sparkFormat, Map("header" -> "true"), None, None) + + result + .as[AirportInput] + .collect() should contain theSameElementsAs expectedUpdate + } } - "insert" should "append a dataframe to an existing file" in withInitialDfInFolder { - withUpdatedDfInFolder(simpleExpectedUpdate.toDF(), { - folderLocation => + "insert" should "append a dataframe to an existing file" in withOutputFolder("root") { root => + withInitialDfInFolder(root) { folderLocation => + withUpdatedDfInFolder(expectedUpdate.toDF(), folderLocation) { val result = ExcelLoader.read(folderLocation, EXCEL.sparkFormat, Map("header" -> "true"), None, None) result .as[AirportInput] - .collect() should contain theSameElementsAs (expected ++ simpleExpectedUpdate) - }) + .collect() should contain theSameElementsAs (expected ++ expectedUpdate) + } + } } } From da4adf7f6d95e4d15f733ab2d1a148340fc02e2f Mon Sep 17 00:00:00 2001 From: Robert <78926291+Asciax@users.noreply.github.com> Date: Wed, 4 Mar 2026 16:03:31 -0500 Subject: [PATCH 5/6] fix: Place correct new path in test --- .../bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala b/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala index 4ceffbaf..432534e7 100644 --- a/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala +++ b/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala @@ -93,7 +93,7 @@ class ExcelLoaderSpec extends SparkSpec { withInitialDfInFolder(root) { folderLocation => //Overwriting the same location - ExcelLoader.writeOnce(outputLocation, "", "", expectedUpdate.toDF(), Nil, EXCEL.sparkFormat, Map("header" -> "true")) + ExcelLoader.writeOnce(folderLocation, "", "", expectedUpdate.toDF(), Nil, EXCEL.sparkFormat, Map("header" -> "true")) val result = ExcelLoader.read(folderLocation, EXCEL.sparkFormat, Map("header" -> "true"), None, None) From 38ac1136959e1eedb664e70dddb6666616fcb2e4 Mon Sep 17 00:00:00 2001 From: Robert <78926291+Asciax@users.noreply.github.com> Date: Wed, 4 Mar 2026 16:58:50 -0500 Subject: [PATCH 6/6] fix: Implement suggested changes --- .../bio/ferlab/datalake/spark3/loader/ExcelLoader.scala | 8 ++++---- .../ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala | 8 +------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala index 7637bd1f..5d3946ee 100644 --- a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala +++ b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/loader/ExcelLoader.scala @@ -21,10 +21,10 @@ object ExcelLoader extends Loader { } private def write(df: DataFrame, - location: String, - format: String, - options: Map[String, String], - mode: SaveMode): DataFrame = { + location: String, + format: String, + options: Map[String, String], + mode: SaveMode): DataFrame = { // Excel format requires the schema to be non-empty, does not support empty schema dataframe writes require(df.schema.nonEmpty, "DataFrame must have a valid schema with at least one column.") require(options.isDefinedAt("header"), "Expecting [header] to be defined in readOptions.") diff --git a/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala b/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala index 432534e7..4ce372ee 100644 --- a/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala +++ b/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/loader/ExcelLoaderSpec.scala @@ -4,9 +4,6 @@ import bio.ferlab.datalake.commons.config.Format.EXCEL import bio.ferlab.datalake.spark3.testutils.AirportInput import bio.ferlab.datalake.testutils.SparkSpec import org.apache.spark.sql.DataFrame -import org.scalatest.BeforeAndAfterEach - -import java.nio.file.{Files, Paths} class ExcelLoaderSpec extends SparkSpec { @@ -58,11 +55,8 @@ class ExcelLoaderSpec extends SparkSpec { it should "throw an exception when the header option is missing" in { val fileLocation: String = folderPath + "airports.xlsx" - val options = Map.empty[String, String] - val databaseName, tableName : Option[String] = None - an[IllegalArgumentException] should be thrownBy { - ExcelLoader.read(fileLocation, EXCEL.sparkFormat, options, databaseName, tableName) + ExcelLoader.read(fileLocation, EXCEL.sparkFormat, readOptions = Map.empty, databaseName = None, tableName = None) } }