Skip to content

Commit e79b0da

Browse files
Merge pull request #614 from databrickslabs/dbr17/ogr-readers
OGR Named Readers
2 parents 125d618 + 07c13b1 commit e79b0da

File tree

13 files changed

+187
-8
lines changed

13 files changed

+187
-8
lines changed

scripts/docker/README-DOCKER.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
For tests to run in docker we need to copy the jar to site-packages/pyspark/jars.
2-
Example: /root/mosaic/python/dev/lib/python3.12/site-packages/pyspark/jars/
2+
Example: /root/geobrix/python/dev/lib/python3.12/site-packages/pyspark/jars/
33
spark.addArtifact does not work due to permissions issues.
44
Also there could be annoying warnings about jupyter_client which isn't something relevant to the pytest we do.
55
If those warnings occur use: export JUPYTER_PLATFORM_DIRS=1
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
com.databricks.labs.gbx.rasterx.ds.gdal.GDAL_DataSource
22
com.databricks.labs.gbx.vectorx.ds.ogr.OGR_DataSource
33
com.databricks.labs.gbx.vectorx.ds.shp.ShapeFile_DataSource
4+
com.databricks.labs.gbx.vectorx.ds.gdb.FileGDB_DataSource
5+
com.databricks.labs.gbx.vectorx.ds.geojson.GeoJSON_DataSource
6+
com.databricks.labs.gbx.vectorx.ds.gpkg.GPKG_DataSource
47
com.databricks.labs.gbx.ds.register.RegisterDataSource
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package com.databricks.labs.gbx.ds
2+
3+
import org.apache.spark.sql.util.CaseInsensitiveStringMap
4+
5+
import scala.jdk.CollectionConverters._
6+
7+
/** Option-injection mixin for the named OGR data sources (shapefile, file_gdb,
  * geojson, ogr_gpkg). Each concrete reader declares the extra options it forces
  * via [[dsExtraMap]] (typically a fixed GDAL `driverName`), and the two helpers
  * below merge those extras into the option maps Spark hands to `inferSchema`
  * and `getTable`.
  *
  * Precedence: extras OVERRIDE caller-supplied values (`callerMap ++ extras`),
  * i.e. a user-supplied `driverName` cannot undo the reader's pinned driver.
  */
trait DataSourceExtras {

  /** Extra options this data source forces.
    *
    * @param checkMap caller-supplied options, with keys lower-cased so lookups
    *                 match `CaseInsensitiveStringMap` semantics; implementations
    *                 may inspect it to vary the extras (see GeoJSON's "multi").
    * @return options to merge on top of the caller's options
    */
  def dsExtraMap(checkMap: Map[String, String] = Map.empty): Map[String, String]

  /** Returns `properties` with [[dsExtraMap]] merged in (extras win), as a Java map.
    *
    * NOTE(review): `properties` presumably arrives case-insensitive from Spark's
    * `getTable` path — keys are lower-cased here for the `checkMap` lookup only;
    * the merged map keeps the original entries untouched. Confirm against caller.
    */
  def extraJavaUtilMap(properties: java.util.Map[String, String]): java.util.Map[String, String] = {
    val cMap = properties.asScala.toMap
    val newMap = cMap ++ dsExtraMap(checkMap = lowerKeys(cMap))
    newMap.asJava
  }

  /** Returns `options` with [[dsExtraMap]] merged in (extras win). */
  def extraCaseInsensitiveStringMap(options: CaseInsensitiveStringMap): CaseInsensitiveStringMap = {
    // asCaseSensitiveMap() preserves the caller's ORIGINAL key casing, so the
    // checkMap handed to dsExtraMap is lower-cased; otherwise a lookup like
    // checkMap.get("multi") would miss a user-supplied "Multi" even though
    // CaseInsensitiveStringMap treats the two as the same key.
    val cMap = options.asCaseSensitiveMap().asScala.toMap
    val newMap = cMap ++ dsExtraMap(checkMap = lowerKeys(cMap))
    new CaseInsensitiveStringMap(newMap.asJava)
  }

  /** Lower-cases map keys (ROOT locale, to avoid locale-dependent surprises). */
  private def lowerKeys(m: Map[String, String]): Map[String, String] =
    m.map { case (k, v) => k.toLowerCase(java.util.Locale.ROOT) -> v }

}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package com.databricks.labs.gbx.vectorx.ds.gdb
2+
3+
import com.databricks.labs.gbx.ds.DataSourceExtras
4+
import com.databricks.labs.gbx.vectorx.ds.ogr.OGR_DataSource
5+
import org.apache.spark.sql.connector.catalog.Table
6+
import org.apache.spark.sql.connector.expressions.Transform
7+
import org.apache.spark.sql.types.StructType
8+
import org.apache.spark.sql.util.CaseInsensitiveStringMap
9+
10+
//noinspection ScalaUnusedSymbol
/** Named OGR reader for ESRI File Geodatabases, registered as "file_gdb".
  *
  * Thin specialization of [[OGR_DataSource]]: it pins the GDAL driver to
  * "OpenFileGDB" and routes the option maps through the [[DataSourceExtras]]
  * merge helpers before delegating to the parent implementation.
  */
class FileGDB_DataSource extends OGR_DataSource with DataSourceExtras {

  /** Always forces the OpenFileGDB driver; caller options cannot override it. */
  override def dsExtraMap(checkMap: Map[String, String] = Map.empty): Map[String, String] =
    Map("driverName" -> "OpenFileGDB")

  override def shortName(): String = "file_gdb"

  override def inferSchema(options: CaseInsensitiveStringMap): StructType =
    super.inferSchema(extraCaseInsensitiveStringMap(options))

  override def getTable(schema: StructType, partitions: Array[Transform], properties: java.util.Map[String, String]): Table =
    super.getTable(schema, partitions, extraJavaUtilMap(properties))

}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package com.databricks.labs.gbx.vectorx.ds.geojson
2+
3+
import com.databricks.labs.gbx.ds.DataSourceExtras
4+
import com.databricks.labs.gbx.vectorx.ds.ogr.OGR_DataSource
5+
import org.apache.spark.sql.connector.catalog.Table
6+
import org.apache.spark.sql.connector.expressions.Transform
7+
import org.apache.spark.sql.types.StructType
8+
import org.apache.spark.sql.util.CaseInsensitiveStringMap
9+
10+
11+
//noinspection ScalaUnusedSymbol
/** Named OGR reader for GeoJSON, registered as "geojson".
  *
  * Specializes [[OGR_DataSource]] by forcing a GDAL GeoJSON driver and routing
  * the option maps through the [[DataSourceExtras]] merge helpers before
  * delegating to the parent implementation.
  */
class GeoJSON_DataSource extends OGR_DataSource with DataSourceExtras {

  /** Selects the GDAL driver from the caller's "multi" option.
    *
    * Defaults to multi = true given common use, which maps to the
    * newline-delimited "GeoJSONSeq" driver; "multi=false" selects the plain
    * "GeoJSON" driver.
    *
    * @throws IllegalArgumentException if "multi" is set to a non-boolean value
    *         (previously `.toBoolean` threw the same exception type but with an
    *         unhelpful generic message; this names the option and the value)
    */
  override def dsExtraMap(checkMap: Map[String, String] = Map.empty): Map[String, String] = {
    val multi = checkMap.get("multi") match {
      case Some(raw) =>
        raw.toBooleanOption.getOrElse(
          throw new IllegalArgumentException(
            s"GeoJSON_DataSource: option 'multi' must be 'true' or 'false', got '$raw'"
          )
        )
      case None => true
    }
    if (multi) Map("driverName" -> "GeoJSONSeq") else Map("driverName" -> "GeoJSON")
  }

  override def shortName(): String = "geojson"

  override def inferSchema(options: CaseInsensitiveStringMap): StructType =
    super.inferSchema(extraCaseInsensitiveStringMap(options))

  override def getTable(schema: StructType, partitions: Array[Transform], properties: java.util.Map[String, String]): Table =
    super.getTable(schema, partitions, extraJavaUtilMap(properties))

}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package com.databricks.labs.gbx.vectorx.ds.gpkg
2+
3+
import com.databricks.labs.gbx.ds.DataSourceExtras
4+
import com.databricks.labs.gbx.vectorx.ds.ogr.OGR_DataSource
5+
import org.apache.spark.sql.connector.catalog.Table
6+
import org.apache.spark.sql.connector.expressions.Transform
7+
import org.apache.spark.sql.types.StructType
8+
import org.apache.spark.sql.util.CaseInsensitiveStringMap
9+
10+
11+
//noinspection ScalaUnusedSymbol
/** Named OGR reader for GeoPackage files, registered as "ogr_gpkg".
  *
  * Thin specialization of [[OGR_DataSource]]: it pins the GDAL driver to
  * "GPKG" and routes the option maps through the [[DataSourceExtras]] merge
  * helpers before delegating to the parent implementation.
  *
  * NOTE(review): the short name carries an "ogr_" prefix unlike the sibling
  * readers ("file_gdb", "geojson", "shapefile") — presumably to avoid clashing
  * with another "gpkg" source; confirm this is intentional.
  */
class GPKG_DataSource extends OGR_DataSource with DataSourceExtras {

  /** Always forces the GPKG driver; caller options cannot override it. */
  override def dsExtraMap(checkMap: Map[String, String] = Map.empty): Map[String, String] =
    Map("driverName" -> "GPKG")

  override def shortName(): String = "ogr_gpkg"

  override def inferSchema(options: CaseInsensitiveStringMap): StructType =
    super.inferSchema(extraCaseInsensitiveStringMap(options))

  override def getTable(schema: StructType, partitions: Array[Transform], properties: java.util.Map[String, String]): Table =
    super.getTable(schema, partitions, extraJavaUtilMap(properties))

}

src/main/scala/com/databricks/labs/gbx/vectorx/ds/ogr/OGR_Batch.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ class OGR_Batch(schema: StructType, options: Map[String, String]) extends Scan w
1515

1616
override def planInputPartitions(): Array[InputPartition] = {
1717
val inPath = options("path")
18-
val chunkSize = options("chunkSize").toInt
18+
val chunkSize = options.getOrElse("chunkSize", "10000").toInt
1919
val driverName = options.getOrElse("driverName", "")
2020
val layerN = options.getOrElse("layerNumber", "0").toInt
2121
val layerName = options.getOrElse("layerName", "")
Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,27 @@
11
package com.databricks.labs.gbx.vectorx.ds.shp
22

3+
import com.databricks.labs.gbx.ds.DataSourceExtras
34
import com.databricks.labs.gbx.vectorx.ds.ogr.OGR_DataSource
5+
import org.apache.spark.sql.connector.catalog.Table
6+
import org.apache.spark.sql.connector.expressions.Transform
7+
import org.apache.spark.sql.types.StructType
8+
import org.apache.spark.sql.util.CaseInsensitiveStringMap
9+
410

511
//noinspection ScalaUnusedSymbol
/** Named OGR reader for ESRI Shapefiles, registered as "shapefile".
  *
  * Thin specialization of [[OGR_DataSource]]: it pins the GDAL driver to
  * "ESRI Shapefile" and routes the option maps through the [[DataSourceExtras]]
  * merge helpers before delegating to the parent implementation.
  */
class ShapeFile_DataSource extends OGR_DataSource with DataSourceExtras {

  /** Always forces the ESRI Shapefile driver; caller options cannot override it. */
  override def dsExtraMap(checkMap: Map[String, String] = Map.empty): Map[String, String] =
    Map("driverName" -> "ESRI Shapefile")

  override def shortName(): String = "shapefile"

  override def inferSchema(options: CaseInsensitiveStringMap): StructType =
    super.inferSchema(extraCaseInsensitiveStringMap(options))

  override def getTable(schema: StructType, partitions: Array[Transform], properties: java.util.Map[String, String]): Table =
    super.getTable(schema, partitions, extraJavaUtilMap(properties))

}
Binary file not shown.

0 commit comments

Comments
 (0)