fix: accept GeoJSON strings for Edm.GeographyPoint in AzureSearchWriter (#2420)

chon3806 · chon3806 · commit 201f1372e69c · 2026-04-25T15:17:27.000-04:00
Azure AI Search expects spatial values as GeoJSON objects, but when users
supplied a StringType column the writer JSON-escaped the entire string and
the service rejected the request with HTTP 400. Convert string GeographyPoint
columns into the canonical struct<type, coordinates> shape via from_json
before serialization, mirroring the existing Edm.DateTimeOffset handling.
Existing struct-based input is unchanged.
diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/search/AzureSearch.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/search/AzureSearch.scala
@@ -20,7 +20,8 @@ import org.apache.spark.ml.util._
 import org.apache.spark.ml.{ComplexParamsReadable, NamespaceInjections, PipelineModel}
 import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
 import org.apache.spark.ml.functions.vector_to_array
-import org.apache.spark.sql.functions.{col, expr, struct, to_json, to_utc_timestamp, date_format, when}
+import org.apache.spark.sql.functions.{col, expr, from_json, struct, to_json, to_utc_timestamp,
+  date_format, when}
 import org.apache.spark.sql.streaming.DataStreamWriter
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{DataFrame, Dataset, Row}
@@ -249,6 +250,50 @@ object AzureSearchWriter extends IndexParser with IndexJsonGetter with SLogging
     }
   }
 
+  /**
+   * Converts string columns containing GeoJSON to the proper struct shape required for
+   * Azure Search `Edm.GeographyPoint` fields.
+   *
+   * Azure AI Search expects spatial values to be sent as a GeoJSON object
+   * (e.g. `{"type":"Point","coordinates":[lon, lat]}`), not as a JSON-encoded string.
+   * Passing such a value as a `StringType` column previously caused a `400 Bad Request`
+   * (see issue #2420) because the writer JSON-escaped the entire string.
+   *
+   * For each `Edm.GeographyPoint` index field whose DataFrame column is a `StringType`,
+   * parse the column with `from_json` into `struct<type:string, coordinates:array<double>>`
+   * so that downstream `to_json` emits a proper GeoJSON object. Null strings stay null;
+   * index fields with no matching column and columns of any other type are left untouched.
+   * NOTE(review): unparseable text is not rejected here; `from_json` presumably yields null(s) — confirm.
+   *
+   * @param df DataFrame with potential GeographyPoint columns
+   * @param indexJson JSON string containing the index schema
+   * @return DataFrame with string GeographyPoint columns converted to GeoJSON structs
+   */
+  private def convertGeographyPointToStruct(df: DataFrame, indexJson: String): DataFrame = {
+    val geoStructType = StructType(Seq(
+      StructField("type", StringType),
+      StructField("coordinates", ArrayType(DoubleType))
+    ))
+    val geoFields = parseIndexJson(indexJson).fields
+      .filter(_.`type` == "Edm.GeographyPoint")
+      .map(_.name)
+    geoFields.foldLeft(df) { (currentDF, fieldName) =>
+      if (currentDF.columns.contains(fieldName)) {
+        currentDF.schema(fieldName).dataType match {
+          case StringType =>
+            currentDF.withColumn(fieldName,
+              when(col(fieldName).isNotNull, from_json(col(fieldName), geoStructType))
+            )
+          case _ =>
+            // Already a struct (or otherwise compatible) — let checkSchemaParity validate it.
+            currentDF
+        }
+      } else {
+        currentDF
+      }
+    }
+  }
+
   private def dfToIndexJson(schema: StructType,
                             indexName: String,
                             keyCol: String,
@@ -328,17 +373,18 @@ object AzureSearchWriter extends IndexParser with IndexJsonGetter with SLogging
 
     SearchIndex.createIfNoneExists(subscriptionKey, serviceName, indexJson, apiVersion)
     val dateConvertedDF = convertDateTimeToISO8601(preppedDF, indexJson)
+    val geoConvertedDF = convertGeographyPointToStruct(dateConvertedDF, indexJson)
 
     logInfo("checking schema parity")
-    checkSchemaParity(dateConvertedDF.schema, indexJson, actionCol)
+    checkSchemaParity(geoConvertedDF.schema, indexJson, actionCol)
 
     val df1 = if (filterNulls) {
       val collectionColumns = parseIndexJson(indexJson).fields
         .filter(_.`type`.startsWith("Collection"))
         .map(_.name)
-      collectionColumns.foldLeft(dateConvertedDF) { (ndf, c) => filterOutNulls(ndf, c) }
+      collectionColumns.foldLeft(geoConvertedDF) { (ndf, c) => filterOutNulls(ndf, c) }
     } else {
-      dateConvertedDF
+      geoConvertedDF
     }
 
     // Convert date/timestamp columns to ISO8601 strings for Azure Search
diff --git a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/search/split2/SearchWriterSuitePart2.scala b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/search/split2/SearchWriterSuitePart2.scala
@@ -171,4 +171,37 @@ class SearchWriterSuite extends SearchWriterSuiteUtilities {
 
   }
 
+  test("Handle GeoJSON GeographyPoint fields supplied as strings") {
+    // Regression test for #2420: `location` holds GeoJSON text in a plain String column.
+    val in = generateIndexName()
+    val df = spark.createDataFrame(Seq(
+      ("upload", "0", """{"type":"Point","coordinates":[-122.3493, 47.6205]}"""),
+      ("upload", "1", """{"type":"Point","coordinates":[-122.3351, 47.6080]}""")
+    )).toDF("searchAction", "id", "location")
+
+    val indexJson =
+      s"""
+         |{
+         |  "name": "$in",
+         |  "fields": [
+         |    { "name": "id", "type": "Edm.String", "key": true, "searchable": true, "retrievable": true },
+         |    { "name": "location", "type": "Edm.GeographyPoint", "searchable": false,
+         |     "filterable": true, "retrievable": true, "sortable": true }
+         |  ]
+         |}
+         |""".stripMargin
+    // The index declares `location` as Edm.GeographyPoint, so the writer must handle the string form.
+    AzureSearchWriter.write(df,
+      Map(
+        "subscriptionKey" -> azureSearchKey,
+        "actionCol" -> "searchAction",
+        "serviceName" -> testServiceName,
+        "indexJson" -> indexJson
+      )
+    )
+    // Presumably verifies that index `in` ends up with 2 documents (helper not shown) — confirm.
+    retryWithBackoff(assertSize(in, 2))
+
+  }
+
 }