@@ -3,33 +3,25 @@
  */
 package io.qbeast.spark.internal.sources
 
-import io.qbeast.core.model.{QTableID}
 import org.apache.spark.sql.sources.BaseRelation
 import org.apache.spark.sql.sources.InsertableRelation
-
-import org.apache.spark.sql.{SQLContext}
-import org.apache.spark.sql.types.{StructType, StructField}
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.types.{StructField, StructType}
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.SparkSession
 import io.qbeast.spark.delta.OTreeIndex
 import org.apache.spark.sql.execution.datasources.HadoopFsRelation
 import io.qbeast.spark.table.IndexedTable
 import io.qbeast.context.QbeastContext
-import org.apache.hadoop.fs.{Path}
-import org.apache.spark.sql.catalyst.catalog.{BucketSpec}
+import org.apache.hadoop.fs.Path
+import org.apache.spark.sql.catalyst.catalog.BucketSpec
 import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
 
 /**
  * Companion object for QbeastBaseRelation
  */
 object QbeastBaseRelation {
 
-  /**
-   * Creates a QbeastBaseRelation instance
-   * @param tableID the identifier of the table
-   * @return the QbeastBaseRelation
-   */
-
   /**
    * Returns a HadoopFsRelation that contains all of the data present
    * in the table. This relation will be continually updated
@@ -39,48 +31,72 @@ object QbeastBaseRelation {
    * @param sqlContext the SQLContext
    * @return the HadoopFsRelation
    */
-  def createRelation(sqlContext: SQLContext, table: IndexedTable): BaseRelation = {
+  def createRelation(
+      sqlContext: SQLContext,
+      table: IndexedTable,
+      options: Map[String, String]): BaseRelation = {
 
     val spark = SparkSession.active
     val tableID = table.tableID
     val snapshot = QbeastContext.metadataManager.loadSnapshot(tableID)
     val schema = QbeastContext.metadataManager.loadCurrentSchema(tableID)
-    val revision = snapshot.loadLatestRevision
-    val columnsToIndex = revision.columnTransformers.map(row => row.columnName).mkString(",")
-    val cubeSize = revision.desiredCubeSize
-    val parameters =
-      Map[String, String]("columnsToIndex" -> columnsToIndex, "cubeSize" -> cubeSize.toString())
+    if (snapshot.isInitial) {
+      // If the Table is initial, read empty relation
+      // This could happen if we CREATE/REPLACE TABLE without inserting data
+      // In this case, we use the options variable
+      new HadoopFsRelation(
+        OTreeIndex(spark, new Path(tableID.id)),
+        partitionSchema = StructType(Seq.empty[StructField]),
+        dataSchema = schema,
+        bucketSpec = None,
+        new ParquetFileFormat(),
+        options)(spark) with InsertableRelation {
+        def insert(data: DataFrame, overwrite: Boolean): Unit = {
+          table.save(data, options, append = !overwrite)
+        }
+      }
+    } else {
+      // If the table contains data, initialize it
+      val revision = snapshot.loadLatestRevision
+      val columnsToIndex = revision.columnTransformers.map(row => row.columnName).mkString(",")
+      val cubeSize = revision.desiredCubeSize
+      val parameters =
+        Map[String, String]("columnsToIndex" -> columnsToIndex, "cubeSize" -> cubeSize.toString())
 
-    val path = new Path(tableID.id)
-    val fileIndex = OTreeIndex(spark, path)
-    val bucketSpec: Option[BucketSpec] = None
-    val file = new ParquetFileFormat()
+      val path = new Path(tableID.id)
+      val fileIndex = OTreeIndex(spark, path)
+      val bucketSpec: Option[BucketSpec] = None
+      val file = new ParquetFileFormat()
 
-    new HadoopFsRelation(
-      fileIndex,
-      partitionSchema = StructType(Seq.empty[StructField]),
-      dataSchema = schema,
-      bucketSpec = bucketSpec,
-      file,
-      parameters)(spark) with InsertableRelation {
-      def insert(data: DataFrame, overwrite: Boolean): Unit = {
-        table.save(data, parameters, append = !overwrite)
+      new HadoopFsRelation(
+        fileIndex,
+        partitionSchema = StructType(Seq.empty[StructField]),
+        dataSchema = schema,
+        bucketSpec = bucketSpec,
+        file,
+        parameters)(spark) with InsertableRelation {
+        def insert(data: DataFrame, overwrite: Boolean): Unit = {
+          table.save(data, parameters, append = !overwrite)
+        }
       }
     }
   }
 
   /**
    * Function that can be called from a QbeastBaseRelation object to create a
    * new QbeastBaseRelation with a new tableID.
-   * @param tableID the identifier of the table
    * @param indexedTable the indexed table
    * @return BaseRelation for the new table in Qbeast format
    */
-  def forQbeastTable(tableID: QTableID, indexedTable: IndexedTable): BaseRelation = {
+  def forQbeastTable(indexedTable: IndexedTable): BaseRelation = {
+    forQbeastTableWithOptions(indexedTable, Map.empty)
+  }
 
+  def forQbeastTableWithOptions(
+      indexedTable: IndexedTable,
+      withOptions: Map[String, String]): BaseRelation = {
     val spark = SparkSession.active
-    createRelation(spark.sqlContext, indexedTable)
-
+    createRelation(spark.sqlContext, indexedTable, withOptions)
   }
 
 }
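
For context, a minimal sketch (in Scala) of how the new entry points might be called from the connector side. Only forQbeastTable and forQbeastTableWithOptions come from the diff above; the QbeastRelationExample object, the relationFor helper, and resolving the IndexedTable through QbeastContext.indexedTableFactory.getIndexedTable(QTableID(path)) are assumptions made for illustration.

package io.qbeast.docs.example

import io.qbeast.context.QbeastContext
import io.qbeast.core.model.QTableID
import io.qbeast.spark.internal.sources.QbeastBaseRelation
import org.apache.spark.sql.sources.BaseRelation

// Hypothetical caller sketch, not part of the diff above.
object QbeastRelationExample {

  def relationFor(path: String, options: Map[String, String]): BaseRelation = {
    // Resolve the indexed table for the given location (assumed API:
    // QbeastContext.indexedTableFactory.getIndexedTable).
    val indexedTable = QbeastContext.indexedTableFactory.getIndexedTable(QTableID(path))

    if (options.isEmpty) {
      // No user-supplied options: equivalent to forQbeastTableWithOptions with Map.empty.
      QbeastBaseRelation.forQbeastTable(indexedTable)
    } else {
      // Forward options such as columnsToIndex / cubeSize; createRelation falls back
      // to them when the snapshot is initial and there is no revision to read from.
      QbeastBaseRelation.forQbeastTableWithOptions(indexedTable, options)
    }
  }

}

The design point of the change is visible in createRelation: threading the options map through is what makes CREATE TABLE / REPLACE TABLE without data workable, since a table whose snapshot isInitial has no revision to supply columnsToIndex and cubeSize, so the empty relation keeps the user-supplied options and reuses them in insert.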