Commit 2d0a5aa

osopardo1 authored and committed
Revert "Merge pull request #167 from cugni/spark-3.3.0_delta-2.1.0"
This reverts commit cb19cd4, reversing changes made to d72fea5.
1 parent b9a96b3 commit 2d0a5aa

11 files changed: +50 -75 lines changed

README.md (+23 -12)

````diff
@@ -39,7 +39,7 @@
 4. **Table Tolerance** - Model for sampling fraction and **query accuracy** trade-off.


-## Query example with Qbeast
+### What does it mean? - Let's see an example:

 <div>
 <img src="./docs/images/spark_delta_demo.gif" width="49%" alt="Demo for Delta format GIF" />
@@ -49,13 +49,25 @@

 As you can see above, the Qbeast Spark extension allows **faster** queries with statistically **accurate** sampling.

+
 | Format | Execution Time | Result |
 |--------|:--------------:|:---------:|
-| Delta | ~ 151.3 sec. | 37.869383 |
-| Qbeast | ~ 6.6 sec. | 37.856333 |
+| Delta | ~ 2.5 min. | 37.869383 |
+| Qbeast | ~ 6.6 sec. | 37.856333 |

 In this example, **1% sampling** provides the result **x22 times faster** compared to using Delta format, with an **error of 0,034%**.

+# Getting Started
+
+>### Warning: DO NOT USE IN PRODUCTION!
+> This project is in an early development phase: there are missing functionalities and the API might change drastically.
+>
+> Join ⨝ the community to be a part of this project!
+>
+> See Issues tab to know what is cooking 😎
+
+
+

 # Quickstart
 You can run the qbeast-spark application locally on your computer, or using a Docker image we already prepared with the dependencies.
@@ -67,11 +79,11 @@ Download **Spark 3.1.1 with Hadoop 3.2**, unzip it, and create the `SPARK_HOME`
 >:information_source: **Note**: You can use Hadoop 2.7 if desired, but you could have some troubles with different cloud providers' storage, read more about it [here](docs/CloudStorages.md).

 ```bash
-wget https://www.apache.org/dyn/closer.lua/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
+wget https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz

-tar xzvf spark-3.3.2-bin-hadoop3.tgz
+tar xzvf spark-3.1.1-bin-hadoop3.2.tgz

-export SPARK_HOME=$PWD/spark-3.3.2-bin-hadoop3
+export SPARK_HOME=$PWD/spark-3.1.1-bin-hadoop3.2
 ```
 ### 1. Launch a spark-shell

@@ -164,12 +176,11 @@ Go to [QbeastTable documentation](./docs/QbeastTable.md) for more detailed infor
 Use [Python index visualizer](./utils/visualizer/README.md) for your indexed table to visually examine index structure and gather sampling metrics.

 # Dependencies and Version Compatibility
-| Version | Spark | Hadoop | Delta Lake |
-|----------------------|:-----:|:------:|:----------:|
-| 0.1.0 | 3.0.0 | 3.2.0 | 0.8.0 |
-| 0.2.0 | 3.1.x | 3.2.0 | 1.0.0 |
-| 0.3.x | 3.2.x | 3.3.x | 1.2.x |
-| 0.4.x (coming soon!) | 3.3.x | 3.3.x | 2.1.x |
+| Version | Spark | Hadoop | Delta Lake |
+|---------|:-----:|:------:|:----------:|
+| 0.1.0 | 3.0.0 | 3.2.0 | 0.8.0 |
+| 0.2.0 | 3.1.x | 3.2.0 | 1.0.0 |
+| 0.3.x | 3.2.x | 3.3.x | 1.2.x |

 Check [here](https://docs.delta.io/latest/releases.html) for **Delta Lake** and **Apache Spark** version compatibility.
````
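For context on the quickstart restored above, here is a minimal, hedged spark-shell sketch of writing a dataset in the qbeast format and then querying a sample, in the spirit of the README's 1% sampling example. The dataset, path, and column names are placeholders, not the demo behind the numbers in the table:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().getOrCreate()

// Placeholder data and path; the README demo uses its own dataset.
val df = spark.range(1000000).selectExpr("id AS user_id", "rand() AS price")
val qbeastPath = "/tmp/qbeast-demo"

// Index on the columns you expect to filter or sample on.
df.write
  .format("qbeast")
  .option("columnsToIndex", "user_id,price")
  .save(qbeastPath)

// A 1% sample over the indexed table; this sampling pushdown is what the
// Delta-vs-Qbeast timing comparison in the README is illustrating.
val avgPrice = spark.read
  .format("qbeast")
  .load(qbeastPath)
  .sample(0.01)
  .agg("price" -> "avg")
  .head()
  .getDouble(0)

println(s"Estimated average price: $avgPrice")
```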

project/Dependencies.scala (+3 -3)

```diff
@@ -4,9 +4,9 @@ import sbt._
  * External libraries used in the project with versions.
  */
 object Dependencies {
-  lazy val sparkVersion: String = sys.props.get("spark.version").getOrElse("3.3.0")
-  lazy val hadoopVersion: String = sys.props.get("hadoop.version").getOrElse("3.3.4")
-  lazy val deltaVersion: String = "2.1.0"
+  lazy val sparkVersion: String = sys.props.get("spark.version").getOrElse("3.2.2")
+  lazy val hadoopVersion: String = sys.props.get("hadoop.version").getOrElse("3.3.1")
+  lazy val deltaVersion: String = "1.2.0"

   val sparkCore = "org.apache.spark" %% "spark-core" % sparkVersion
   val sparkSql = "org.apache.spark" %% "spark-sql" % sparkVersion
```
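A side note, not part of the commit: the pinned Spark and Hadoop versions above are only defaults, since both are read from JVM system properties first. A small sketch of that fallback behavior, with an illustrative override value:

```scala
// Sketch of the lookup used in Dependencies.scala: a -D system property wins,
// otherwise the defaults pinned by this revert ("3.2.2" / "3.3.1") are used.
object VersionFallbackSketch extends App {
  val sparkVersion: String = sys.props.get("spark.version").getOrElse("3.2.2")
  val hadoopVersion: String = sys.props.get("hadoop.version").getOrElse("3.3.1")

  // Launching sbt with e.g. -Dspark.version=3.2.1 (illustrative value) would print 3.2.1 here.
  println(s"Building against Spark $sparkVersion and Hadoop $hadoopVersion")
}
```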

project/build.properties (+1 -1)

```diff
@@ -1 +1 @@
-sbt.version = 1.6.2
+sbt.version = 1.5.8
```

src/main/scala/io/qbeast/spark/delta/OTreeIndex.scala (-25)

```diff
@@ -110,10 +110,6 @@ case class OTreeIndex(index: TahoeLogFileIndex) extends FileIndex with Logging {
   override def partitionSchema: StructType = index.partitionSchema
 }

-/**
- * Companion object for OTreeIndex
- * Builds an OTreeIndex instance from the path to a table
- */
 object OTreeIndex {

   def apply(spark: SparkSession, path: Path): OTreeIndex = {
@@ -123,24 +119,3 @@ object OTreeIndex {
   }

 }
-
-/**
- * Singleton object for EmptyIndex.
- * Used when creating a table with no data added
- */
-
-object EmptyIndex extends FileIndex {
-  override def rootPaths: Seq[Path] = Seq.empty
-
-  override def listFiles(
-      partitionFilters: Seq[Expression],
-      dataFilters: Seq[Expression]): Seq[PartitionDirectory] = Seq.empty
-
-  override def inputFiles: Array[String] = Array.empty
-
-  override def refresh(): Unit = {}
-
-  override def sizeInBytes: Long = 0L
-
-  override def partitionSchema: StructType = StructType(Seq.empty)
-}
```

src/main/scala/io/qbeast/spark/internal/rules/SaveAsTableRule.scala (+9 -7)

```diff
@@ -24,13 +24,15 @@ class SaveAsTableRule(spark: SparkSession) extends Rule[LogicalPlan] with Loggin
     // We need to pass the writeOptions as properties to the creation of the table
     // to make sure columnsToIndex is present
     plan transformDown {
-      case saveAsSelect: CreateTableAsSelect if isQbeastProvider(saveAsSelect.tableSpec) =>
-        val finalProperties = saveAsSelect.writeOptions ++ saveAsSelect.tableSpec.properties
-        saveAsSelect.copy(tableSpec = saveAsSelect.tableSpec.copy(properties = finalProperties))
-      case replaceAsSelect: ReplaceTableAsSelect if isQbeastProvider(replaceAsSelect.tableSpec) =>
-        val finalProperties = replaceAsSelect.tableSpec.properties ++ replaceAsSelect.writeOptions
-        replaceAsSelect.copy(tableSpec =
-          replaceAsSelect.tableSpec.copy(properties = finalProperties))
+      case saveAsSelect: CreateTableAsSelect if isQbeastProvider(saveAsSelect.properties) =>
+        val options = saveAsSelect.writeOptions
+        val finalProperties = saveAsSelect.properties ++ options
+        saveAsSelect.copy(properties = finalProperties)
+      case replaceAsSelect: ReplaceTableAsSelect
+          if isQbeastProvider(replaceAsSelect.properties) =>
+        val options = replaceAsSelect.writeOptions
+        val finalProperties = replaceAsSelect.properties ++ options
+        replaceAsSelect.copy(properties = finalProperties)
     }
   }
 }
```
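For orientation (not part of the diff): this rule rewrites V2 CREATE/REPLACE TABLE AS SELECT plans so that options passed through the DataFrame API survive as table properties. A hedged sketch of a write that would exercise it, where the catalog/table name and column names are made up for illustration:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().getOrCreate()
val events = spark.range(1000).selectExpr("id AS user_id", "id % 100 AS price")

// Hypothetical CTAS through DataFrameWriterV2; SaveAsTableRule is meant to copy
// write options such as columnsToIndex into the created table's properties.
events
  .writeTo("my_catalog.analytics.events_qbeast") // hypothetical table name
  .using("qbeast")
  .option("columnsToIndex", "user_id,price")
  .createOrReplace()
```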

src/main/scala/io/qbeast/spark/internal/sources/QbeastBaseRelation.scala (+2 -2)

```diff
@@ -9,7 +9,7 @@ import org.apache.spark.sql.SQLContext
 import org.apache.spark.sql.types.{StructField, StructType}
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.SparkSession
-import io.qbeast.spark.delta.{EmptyIndex, OTreeIndex}
+import io.qbeast.spark.delta.OTreeIndex
 import org.apache.spark.sql.execution.datasources.HadoopFsRelation
 import io.qbeast.spark.table.IndexedTable
 import io.qbeast.context.QbeastContext
@@ -45,7 +45,7 @@ object QbeastBaseRelation {
       // This could happen if we CREATE/REPLACE TABLE without inserting data
       // In this case, we use the options variable
       new HadoopFsRelation(
-        EmptyIndex,
+        OTreeIndex(spark, new Path(tableID.id)),
         partitionSchema = StructType(Seq.empty[StructField]),
         dataSchema = schema,
         bucketSpec = None,
```
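With the revert, a qbeast table that has been created but not yet written to is backed by an OTreeIndex over its (still empty) table path rather than the removed EmptyIndex. A hedged sketch of the scenario the code comment above refers to; the table name, schema, and option value are illustrative, and `spark` is assumed to be an active session with the Qbeast extensions enabled:

```scala
// Hypothetical CREATE TABLE with no data inserted yet.
spark.sql("""
  CREATE TABLE student (id INT, name STRING, age INT)
  USING qbeast
  OPTIONS ('columnsToIndex' = 'id,age')
""")

// Reading it back is expected to yield an empty DataFrame with the declared schema.
val df = spark.table("student")
assert(df.isEmpty)
df.printSchema()
```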

src/main/scala/io/qbeast/spark/internal/sources/catalog/QbeastCatalog.scala (+3 -10)

```diff
@@ -15,7 +15,6 @@ import org.apache.spark.sql.catalyst.analysis.{
 }
 import org.apache.spark.sql.{SparkCatalogUtils, SparkSession}
 import org.apache.spark.sql.connector.catalog._
-import org.apache.spark.sql.connector.catalog.functions.UnboundFunction
 import org.apache.spark.sql.connector.expressions.Transform
 import org.apache.spark.sql.delta.catalog.DeltaCatalog
 import org.apache.spark.sql.types.StructType
@@ -30,7 +29,7 @@ import scala.collection.JavaConverters._
  * QbeastCatalog uses a session catalog of type T
  * to delegate high-level operations
  */
-class QbeastCatalog[T <: TableCatalog with SupportsNamespaces with FunctionCatalog]
+class QbeastCatalog[T <: TableCatalog with SupportsNamespaces]
     extends CatalogExtension
     with SupportsNamespaces
     with StagingTableCatalog {
@@ -234,8 +233,8 @@ class QbeastCatalog[T <: TableCatalog with SupportsNamespaces with FunctionCatal
   override def alterNamespace(namespace: Array[String], changes: NamespaceChange*): Unit =
     getSessionCatalog().alterNamespace(namespace, changes.head)

-  override def dropNamespace(namespace: Array[String], cascade: Boolean): Boolean =
-    getSessionCatalog().dropNamespace(namespace, cascade)
+  override def dropNamespace(namespace: Array[String]): Boolean =
+    getSessionCatalog().dropNamespace(namespace)

   override def initialize(name: String, options: CaseInsensitiveStringMap): Unit = {
     // Initialize the catalog with the corresponding name
@@ -255,10 +254,4 @@ class QbeastCatalog[T <: TableCatalog with SupportsNamespaces with FunctionCatal
     } else throw new IllegalArgumentException("Invalid session catalog: " + delegate)
   }

-  override def listFunctions(namespace: Array[String]): Array[Identifier] =
-    getSessionCatalog().listFunctions(namespace)
-
-  override def loadFunction(ident: Identifier): UnboundFunction =
-    getSessionCatalog().loadFunction(ident)
-
 }
```
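As background (also not part of this commit): QbeastCatalog is a CatalogExtension that delegates to the underlying session catalog, which is why restoring the Spark 3.2-era dropNamespace signature and dropping the FunctionCatalog methods is enough here. A hedged sketch of how such a catalog is typically wired into a session; the configuration key is the standard Spark one, but treat the exact setup as illustrative rather than this repository's documented quickstart:

```scala
import org.apache.spark.sql.SparkSession

// Illustrative only: register QbeastCatalog as the delegating session catalog.
val spark = SparkSession
  .builder()
  .master("local[*]")
  .config(
    "spark.sql.catalog.spark_catalog",
    "io.qbeast.spark.internal.sources.catalog.QbeastCatalog")
  .getOrCreate()

// Namespace DDL issued through the session is then forwarded to the delegate,
// using the single-argument dropNamespace restored by this commit.
spark.sql("CREATE NAMESPACE IF NOT EXISTS demo_ns") // hypothetical namespace
spark.sql("DROP NAMESPACE demo_ns")
```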

src/main/scala/io/qbeast/spark/internal/sources/catalog/QbeastCatalogUtils.scala (-5)

```diff
@@ -11,7 +11,6 @@ import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis.CannotReplaceMissingTableException
 import org.apache.spark.sql.catalyst.catalog._
-import org.apache.spark.sql.catalyst.plans.logical.TableSpec
 import org.apache.spark.sql.connector.catalog.{Identifier, Table}
 import org.apache.spark.sql.connector.expressions.Transform
 import org.apache.spark.sql.delta.DeltaLog
@@ -47,10 +46,6 @@ object QbeastCatalogUtils {
     provider.isDefined && provider.get == QBEAST_PROVIDER_NAME
   }

-  def isQbeastProvider(tableSpec: TableSpec): Boolean = {
-    tableSpec.provider.contains(QBEAST_PROVIDER_NAME)
-  }
-
   def isQbeastProvider(properties: Map[String, String]): Boolean = isQbeastProvider(
     properties.get("provider"))
```
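With the TableSpec overload removed, callers such as SaveAsTableRule go through the property-map overload instead. A quick hedged illustration of the check it performs, assuming QBEAST_PROVIDER_NAME is the "qbeast" format name used elsewhere in the project:

```scala
import io.qbeast.spark.internal.sources.catalog.QbeastCatalogUtils

// Assuming QBEAST_PROVIDER_NAME == "qbeast" (the name passed to spark.read/write.format):
QbeastCatalogUtils.isQbeastProvider(Map("provider" -> "qbeast")) // true
QbeastCatalogUtils.isQbeastProvider(Map("provider" -> "delta"))  // false
QbeastCatalogUtils.isQbeastProvider(Map.empty[String, String])   // false
```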

src/test/scala/io/qbeast/spark/QbeastIntegrationTestSpec.scala (+3 -2)

```diff
@@ -11,7 +11,7 @@ import io.qbeast.spark.delta.SparkDeltaMetadataManager
 import io.qbeast.spark.delta.writer.{SparkDeltaDataWriter}
 import io.qbeast.spark.index.{SparkOTreeManager, SparkRevisionFactory}
 import io.qbeast.spark.table.IndexedTableFactoryImpl
-import org.apache.log4j.{Level}
+import org.apache.log4j.{Level, Logger}
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.{DataFrame, SparkSession}
@@ -32,6 +32,8 @@ import java.nio.file.Files
  * }}}
  */
 trait QbeastIntegrationTestSpec extends AnyFlatSpec with Matchers with DatasetComparer {
+  // This reduce the verbosity of Spark
+  Logger.getLogger("org.apache").setLevel(Level.WARN)

   // Spark Configuration
   // Including Session Extensions and Catalog
@@ -78,7 +80,6 @@ trait QbeastIntegrationTestSpec extends AnyFlatSpec with Matchers with DatasetCo
       .appName("QbeastDataSource")
       .config(sparkConf)
       .getOrCreate()
-    spark.sparkContext.setLogLevel(Level.WARN.toString)
     try {
       testCode(spark)
     } finally {
```

src/test/scala/io/qbeast/spark/internal/sources/catalog/QbeastCatalogTest.scala (+2 -4)

```diff
@@ -163,8 +163,7 @@ class QbeastCatalogTest extends QbeastIntegrationTestSpec with CatalogTestSuite
     val qbeastCatalog = createQbeastCatalog(spark)
     qbeastCatalog.loadNamespaceMetadata(defaultNamespace) shouldBe Map(
       "comment" -> "default database",
-      "location" -> ("file:" + tmpLocation),
-      "owner" -> scala.util.Properties.userName).asJava
+      "location" -> ("file:" + tmpLocation)).asJava
   })

   it should "alter namespace" in withQbeastContextSparkAndTmpWarehouse((spark, tmpLocation) => {
@@ -180,7 +179,6 @@ class QbeastCatalogTest extends QbeastIntegrationTestSpec with CatalogTestSuite
     qbeastCatalog.loadNamespaceMetadata(newNamespace) shouldBe Map(
       "comment" -> "",
       "location" -> ("file:" + tmpLocation + "/new_namespace.db"),
-      "owner" -> scala.util.Properties.userName,
       "newPropertie" -> "newValue").asJava

   })
@@ -193,7 +191,7 @@ class QbeastCatalogTest extends QbeastIntegrationTestSpec with CatalogTestSuite
     qbeastCatalog.listNamespaces() shouldBe Array(defaultNamespace, Array("new_namespace"))

     // Drop Namespace
-    qbeastCatalog.dropNamespace(newNamespace, true)
+    qbeastCatalog.dropNamespace(newNamespace)

     qbeastCatalog.listNamespaces() shouldBe Array(defaultNamespace)
```

src/test/scala/io/qbeast/spark/utils/ConvertToQbeastTest.scala (+4 -4)

```diff
@@ -63,7 +63,7 @@ class ConvertToQbeastTest
       val sourceDf = spark.read.format(fileFormat).load(tmpDir)
       val qbeastDf = spark.read.format("qbeast").load(tmpDir)

-      assertLargeDatasetEquality(qbeastDf, sourceDf, orderedComparison = false)
+      assertLargeDatasetEquality(qbeastDf, sourceDf)

       // All non-qbeast files are considered staging files and are placed
       // directly into the staging revision(RevisionID = 0)
@@ -83,7 +83,7 @@ class ConvertToQbeastTest
       val sourceDf = spark.read.format(fileFormat).load(tmpDir)
       val qbeastDf = spark.read.format("qbeast").load(tmpDir)

-      assertLargeDatasetEquality(qbeastDf, sourceDf, orderedComparison = false)
+      assertLargeDatasetEquality(qbeastDf, sourceDf)

       // All non-qbeast files are considered staging files and are placed
       // directly into the staging revision(RevisionID = 0)
@@ -214,7 +214,7 @@ class ConvertToQbeastTest
       // Compare DataFrames
       val sourceDf = spark.read.format(fileFormat).load(tmpDir)
       val qbeastDf = spark.read.format("qbeast").load(tmpDir)
-      assertLargeDatasetEquality(qbeastDf, sourceDf, orderedComparison = false)
+      assertLargeDatasetEquality(qbeastDf, sourceDf)
     })

   "Compacting the staging revision" should "reduce the number of delta AddFiles" in
@@ -233,7 +233,7 @@ class ConvertToQbeastTest
       // Compare DataFrames
       val sourceDf = spark.read.format(fileFormat).load(tmpDir)
       val qbeastDf = spark.read.format("qbeast").load(tmpDir)
-      assertLargeDatasetEquality(qbeastDf, sourceDf, orderedComparison = false)
+      assertLargeDatasetEquality(qbeastDf, sourceDf)

       // Standard staging revision behavior
       val qs = getQbeastSnapshot(spark, tmpDir)
```
