Skip to content

Commit 625b1df

Browse files
adriangb and claude committed
Also emit canonical Arrow JSON extension metadata keys
Arrow defines a canonical extension type for JSON (https://arrow.apache.org/docs/format/CanonicalExtensions.html#json):

    ARROW:extension:name = arrow.json
    ARROW:extension:metadata = {}

Emit those alongside the existing is_json=true key so JSON-bearing string fields are recognized by the broader Arrow ecosystem (arrow-rs's Json extension type, pyarrow, DuckDB, Polars, etc.) while remaining back-compatible with consumers keyed on is_json.

The change is contained to json_field_metadata(); all four production write sites already route through that helper. Detection via is_json_union remains purely structural and is unaffected.

Tests are extended to assert all three keys, and the helper is re-exported at the crate root so test fixtures can use it instead of duplicating literals.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent a3d9f62 commit 625b1df

3 files changed

Lines changed: 30 additions & 12 deletions

File tree

src/common_union.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,18 @@ use datafusion::common::ScalarValue;
1313
///
1414
/// Attach this to any Arrow `Field` whose values are JSON-encoded strings so
1515
/// downstream consumers can recognize them as JSON rather than opaque text.
16+
///
17+
/// Emits both the legacy `is_json` key (for back-compat with existing
18+
/// consumers of this crate) and Arrow's canonical JSON extension type keys
19+
/// (`ARROW:extension:name` = `arrow.json`, `ARROW:extension:metadata` = `{}`),
20+
/// see <https://arrow.apache.org/docs/format/CanonicalExtensions.html#json>.
21+
#[must_use]
1622
pub fn json_field_metadata() -> HashMap<String, String> {
17-
HashMap::from([("is_json".to_string(), "true".to_string())])
23+
HashMap::from([
24+
("is_json".to_string(), "true".to_string()),
25+
("ARROW:extension:name".to_string(), "arrow.json".to_string()),
26+
("ARROW:extension:metadata".to_string(), "{}".to_string()),
27+
])
1828
}
1929

2030
pub fn is_json_union(data_type: &DataType) -> bool {

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ mod json_length;
2222
mod json_object_keys;
2323
mod rewrite;
2424

25-
pub use common_union::{JsonUnionEncoder, JsonUnionValue, JSON_UNION_DATA_TYPE};
25+
pub use common_union::{json_field_metadata, JsonUnionEncoder, JsonUnionValue, JSON_UNION_DATA_TYPE};
2626

2727
pub mod functions {
2828
pub use crate::json_as_text::json_as_text;

tests/main.rs

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ use datafusion::common::ScalarValue;
99
use datafusion::config::ConfigOptions;
1010
use datafusion::logical_expr::{ColumnarValue, ScalarFunctionArgs};
1111
use datafusion::prelude::SessionContext;
12+
use datafusion_functions_json::json_field_metadata;
1213
use datafusion_functions_json::udfs::json_get_str_udf;
1314
use utils::{create_context, display_val, logical_plan, run_query, run_query_params};
1415

@@ -162,15 +163,15 @@ async fn test_json_get_array_with_path() {
162163
}
163164

164165
#[tokio::test]
165-
async fn test_json_get_array_inner_field_is_json_metadata() {
166+
async fn test_json_get_array_inner_field_json_metadata() {
166167
let sql = r#"select json_get_array('[{"a": 1}, {"b": 2}]') as v"#;
167168
let batches = run_query(sql).await.unwrap();
168169
let schema = batches[0].schema();
169170
let field = schema.field(0);
170171
let DataType::List(inner_field) = field.data_type() else {
171172
panic!("expected List, got {:?}", field.data_type());
172173
};
173-
assert_eq!(inner_field.metadata().get("is_json").map(String::as_str), Some("true"));
174+
assert_json_field_metadata(inner_field.metadata());
174175

175176
let array_field = batches[0]
176177
.column(0)
@@ -180,9 +181,18 @@ async fn test_json_get_array_inner_field_is_json_metadata() {
180181
let DataType::List(produced_inner) = array_field.data_type() else {
181182
panic!("expected List in produced array");
182183
};
184+
assert_json_field_metadata(produced_inner.metadata());
185+
}
186+
187+
fn assert_json_field_metadata(metadata: &HashMap<String, String>) {
188+
assert_eq!(metadata.get("is_json").map(String::as_str), Some("true"));
189+
assert_eq!(
190+
metadata.get("ARROW:extension:name").map(String::as_str),
191+
Some("arrow.json")
192+
);
183193
assert_eq!(
184-
produced_inner.metadata().get("is_json").map(String::as_str),
185-
Some("true")
194+
metadata.get("ARROW:extension:metadata").map(String::as_str),
195+
Some("{}")
186196
);
187197
}
188198

@@ -437,12 +447,12 @@ async fn test_json_get_json_float() {
437447
}
438448

439449
#[tokio::test]
440-
async fn test_json_get_json_is_json_metadata() {
450+
async fn test_json_get_json_json_metadata() {
441451
let sql = r#"select json_get_json('{"x": [1, 2]}', 'x') as v"#;
442452
let batches = run_query(sql).await.unwrap();
443453
let schema = batches[0].schema();
444454
let field = schema.field(0);
445-
assert_eq!(field.metadata().get("is_json").map(String::as_str), Some("true"));
455+
assert_json_field_metadata(field.metadata());
446456
}
447457

448458
#[tokio::test]
@@ -632,8 +642,7 @@ fn test_json_get_utf8() {
632642
],
633643
number_rows: 1,
634644
return_field: Arc::new(
635-
Field::new("ret_field", DataType::Utf8, false)
636-
.with_metadata(HashMap::from_iter(vec![("is_json".to_string(), "true".to_string())])),
645+
Field::new("ret_field", DataType::Utf8, false).with_metadata(json_field_metadata()),
637646
),
638647
config_options: Arc::new(ConfigOptions::default()),
639648
})
@@ -666,8 +675,7 @@ fn test_json_get_large_utf8() {
666675
],
667676
number_rows: 1,
668677
return_field: Arc::new(
669-
Field::new("ret_field", DataType::Utf8, false)
670-
.with_metadata(HashMap::from_iter(vec![("is_json".to_string(), "true".to_string())])),
678+
Field::new("ret_field", DataType::Utf8, false).with_metadata(json_field_metadata()),
671679
),
672680
config_options: Arc::new(ConfigOptions::default()),
673681
})

0 commit comments

Comments
 (0)