Skip to content

Commit a3d9f62

Browse files
adriangbclaudeCopilot
authored
Mark JSON-bearing string fields with is_json metadata (#111)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 3751908 commit a3d9f62

4 files changed

Lines changed: 67 additions & 18 deletions

File tree

src/common_union.rs

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,14 @@ use datafusion::arrow::datatypes::{DataType, Field, UnionFields, UnionMode};
99
use datafusion::arrow::error::ArrowError;
1010
use datafusion::common::ScalarValue;
1111

12+
/// Field metadata used to mark a `Utf8` field as containing raw JSON.
13+
///
14+
/// Attach this to any Arrow `Field` whose values are JSON-encoded strings so
15+
/// downstream consumers can recognize them as JSON rather than opaque text.
16+
pub fn json_field_metadata() -> HashMap<String, String> {
17+
HashMap::from([("is_json".to_string(), "true".to_string())])
18+
}
19+
1220
pub fn is_json_union(data_type: &DataType) -> bool {
1321
match data_type {
1422
DataType::Union(fields, UnionMode::Sparse) => fields == &union_fields(),
@@ -161,8 +169,6 @@ fn union_fields() -> UnionFields {
161169
static FIELDS: OnceLock<UnionFields> = OnceLock::new();
162170
FIELDS
163171
.get_or_init(|| {
164-
let json_metadata: HashMap<String, String> =
165-
HashMap::from_iter(vec![("is_json".to_string(), "true".to_string())]);
166172
UnionFields::from_iter([
167173
(TYPE_ID_NULL, Arc::new(Field::new("null", DataType::Null, true))),
168174
(TYPE_ID_BOOL, Arc::new(Field::new("bool", DataType::Boolean, false))),
@@ -171,11 +177,11 @@ fn union_fields() -> UnionFields {
171177
(TYPE_ID_STR, Arc::new(Field::new("str", DataType::Utf8, false))),
172178
(
173179
TYPE_ID_ARRAY,
174-
Arc::new(Field::new("array", DataType::Utf8, false).with_metadata(json_metadata.clone())),
180+
Arc::new(Field::new("array", DataType::Utf8, false).with_metadata(json_field_metadata())),
175181
),
176182
(
177183
TYPE_ID_OBJECT,
178-
Arc::new(Field::new("object", DataType::Utf8, false).with_metadata(json_metadata.clone())),
184+
Arc::new(Field::new("object", DataType::Utf8, false).with_metadata(json_field_metadata())),
179185
),
180186
])
181187
})

src/json_get_array.rs

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,18 @@ use std::any::Any;
22
use std::sync::Arc;
33

44
use datafusion::arrow::array::{ArrayRef, ListBuilder, StringBuilder};
5-
use datafusion::arrow::datatypes::DataType;
5+
use datafusion::arrow::datatypes::{DataType, Field};
66
use datafusion::common::{Result as DataFusionResult, ScalarValue};
77
use datafusion::logical_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility};
88
use jiter::Peek;
99

1010
use crate::common::{get_err, invoke, jiter_json_find, return_type_check, GetError, InvokeResult, JsonPath};
1111
use crate::common_macros::make_udf_function;
12+
use crate::common_union::json_field_metadata;
13+
14+
fn list_item_field() -> Field {
15+
Field::new("item", DataType::Utf8, true).with_metadata(json_field_metadata())
16+
}
1217

1318
make_udf_function!(
1419
JsonGetArray,
@@ -46,15 +51,7 @@ impl ScalarUDFImpl for JsonGetArray {
4651
}
4752

4853
fn return_type(&self, arg_types: &[DataType]) -> DataFusionResult<DataType> {
49-
return_type_check(
50-
arg_types,
51-
self.name(),
52-
DataType::List(Arc::new(datafusion::arrow::datatypes::Field::new(
53-
"item",
54-
DataType::Utf8,
55-
true,
56-
))),
57-
)
54+
return_type_check(arg_types, self.name(), DataType::List(Arc::new(list_item_field())))
5855
}
5956

6057
fn invoke_with_args(&self, args: ScalarFunctionArgs) -> DataFusionResult<ColumnarValue> {
@@ -96,7 +93,7 @@ impl InvokeResult for BuildArrayList {
9693

9794
fn builder(capacity: usize) -> Self::Builder {
9895
let values_builder = StringBuilder::new();
99-
ListBuilder::with_capacity(values_builder, capacity)
96+
ListBuilder::with_capacity(values_builder, capacity).with_field(list_item_field())
10097
}
10198

10299
fn append_value(builder: &mut Self::Builder, value: Option<Self::Item>) {
@@ -108,7 +105,7 @@ impl InvokeResult for BuildArrayList {
108105
}
109106

110107
fn scalar(value: Option<Self::Item>) -> ScalarValue {
111-
let mut builder = ListBuilder::new(StringBuilder::new());
108+
let mut builder = ListBuilder::new(StringBuilder::new()).with_field(list_item_field());
112109

113110
if let Some(array_items) = value {
114111
for item in array_items {

src/json_get_json.rs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
use std::any::Any;
2+
use std::sync::Arc;
23

34
use datafusion::arrow::array::StringArray;
4-
use datafusion::arrow::datatypes::DataType;
5+
use datafusion::arrow::datatypes::{DataType, Field, FieldRef};
56
use datafusion::common::Result as DataFusionResult;
6-
use datafusion::logical_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility};
7+
use datafusion::logical_expr::{
8+
ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
9+
};
710

811
use crate::common::{get_err, invoke, jiter_json_find, return_type_check, GetError, JsonPath};
912
use crate::common_macros::make_udf_function;
13+
use crate::common_union::json_field_metadata;
1014

1115
make_udf_function!(
1216
JsonGetJson,
@@ -47,6 +51,14 @@ impl ScalarUDFImpl for JsonGetJson {
4751
return_type_check(arg_types, self.name(), DataType::Utf8)
4852
}
4953

54+
fn return_field_from_args(&self, args: ReturnFieldArgs) -> DataFusionResult<FieldRef> {
55+
let arg_types: Vec<DataType> = args.arg_fields.iter().map(|f| f.data_type().clone()).collect();
56+
let return_type = self.return_type(&arg_types)?;
57+
Ok(Arc::new(
58+
Field::new(self.name(), return_type, true).with_metadata(json_field_metadata()),
59+
))
60+
}
61+
5062
fn invoke_with_args(&self, args: ScalarFunctionArgs) -> DataFusionResult<ColumnarValue> {
5163
invoke::<StringArray>(&args.args, jiter_json_get_json)
5264
}

tests/main.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,31 @@ async fn test_json_get_array_with_path() {
161161
assert_eq!(value_repr, "[1, 2, 3]");
162162
}
163163

164+
#[tokio::test]
165+
async fn test_json_get_array_inner_field_is_json_metadata() {
166+
let sql = r#"select json_get_array('[{"a": 1}, {"b": 2}]') as v"#;
167+
let batches = run_query(sql).await.unwrap();
168+
let schema = batches[0].schema();
169+
let field = schema.field(0);
170+
let DataType::List(inner_field) = field.data_type() else {
171+
panic!("expected List, got {:?}", field.data_type());
172+
};
173+
assert_eq!(inner_field.metadata().get("is_json").map(String::as_str), Some("true"));
174+
175+
let array_field = batches[0]
176+
.column(0)
177+
.as_any()
178+
.downcast_ref::<datafusion::arrow::array::ListArray>()
179+
.unwrap();
180+
let DataType::List(produced_inner) = array_field.data_type() else {
181+
panic!("expected List in produced array");
182+
};
183+
assert_eq!(
184+
produced_inner.metadata().get("is_json").map(String::as_str),
185+
Some("true")
186+
);
187+
}
188+
164189
#[tokio::test]
165190
async fn test_json_get_equals() {
166191
let e = run_query(r"select name, json_get(json_data, 'foo')='abc' from test")
@@ -411,6 +436,15 @@ async fn test_json_get_json_float() {
411436
assert_eq!(display_val(batches).await, (DataType::Utf8, "4.2e-1".to_string()));
412437
}
413438

439+
#[tokio::test]
440+
async fn test_json_get_json_is_json_metadata() {
441+
let sql = r#"select json_get_json('{"x": [1, 2]}', 'x') as v"#;
442+
let batches = run_query(sql).await.unwrap();
443+
let schema = batches[0].schema();
444+
let field = schema.field(0);
445+
assert_eq!(field.metadata().get("is_json").map(String::as_str), Some("true"));
446+
}
447+
414448
#[tokio::test]
415449
async fn test_json_length_array() {
416450
let sql = "select json_length('[1, 2, 3]')";

0 commit comments

Comments
 (0)