Skip to content

Commit b2c3dc3

Browse files
adriangb and claude authored
Also emit canonical Arrow JSON extension metadata keys (#112)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent a3d9f62 commit b2c3dc3

3 files changed

Lines changed: 32 additions & 16 deletions

File tree

src/common_union.rs

Lines changed: 17 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,8 +13,24 @@ use datafusion::common::ScalarValue;
1313
///
1414
/// Attach this to any Arrow `Field` whose values are JSON-encoded strings so
1515
/// downstream consumers can recognize them as JSON rather than opaque text.
16+
///
17+
/// Emits Arrow's canonical JSON extension type keys
18+
/// (`ARROW:extension:name` = `arrow.json`, `ARROW:extension:metadata` = `{}`),
19+
/// see <https://arrow.apache.org/docs/format/CanonicalExtensions.html#json>.
20+
///
21+
/// Also emits a legacy `is_json` = `true` key. This key predates this crate's
22+
/// adoption of the canonical extension and is non-standard — no other Arrow
23+
/// tool recognizes it. It is kept only for back-compat with existing
24+
/// downstream consumers of this crate and will be removed in a future
25+
/// release; new consumers should key off `ARROW:extension:name` instead.
26+
#[must_use]
1627
pub fn json_field_metadata() -> HashMap<String, String> {
17-
HashMap::from([("is_json".to_string(), "true".to_string())])
28+
HashMap::from([
29+
("ARROW:extension:name".to_string(), "arrow.json".to_string()),
30+
("ARROW:extension:metadata".to_string(), "{}".to_string()),
31+
// Legacy, non-standard. Remove in a future release — see doc comment above.
32+
("is_json".to_string(), "true".to_string()),
33+
])
1834
}
1935

2036
pub fn is_json_union(data_type: &DataType) -> bool {

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ mod json_length;
2222
mod json_object_keys;
2323
mod rewrite;
2424

25-
pub use common_union::{JsonUnionEncoder, JsonUnionValue, JSON_UNION_DATA_TYPE};
25+
pub use common_union::{json_field_metadata, JsonUnionEncoder, JsonUnionValue, JSON_UNION_DATA_TYPE};
2626

2727
pub mod functions {
2828
pub use crate::json_as_text::json_as_text;

tests/main.rs

Lines changed: 14 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@ use datafusion::common::ScalarValue;
99
use datafusion::config::ConfigOptions;
1010
use datafusion::logical_expr::{ColumnarValue, ScalarFunctionArgs};
1111
use datafusion::prelude::SessionContext;
12+
use datafusion_functions_json::json_field_metadata;
1213
use datafusion_functions_json::udfs::json_get_str_udf;
1314
use utils::{create_context, display_val, logical_plan, run_query, run_query_params};
1415

@@ -162,15 +163,15 @@ async fn test_json_get_array_with_path() {
162163
}
163164

164165
#[tokio::test]
165-
async fn test_json_get_array_inner_field_is_json_metadata() {
166+
async fn test_json_get_array_inner_field_json_metadata() {
166167
let sql = r#"select json_get_array('[{"a": 1}, {"b": 2}]') as v"#;
167168
let batches = run_query(sql).await.unwrap();
168169
let schema = batches[0].schema();
169170
let field = schema.field(0);
170171
let DataType::List(inner_field) = field.data_type() else {
171172
panic!("expected List, got {:?}", field.data_type());
172173
};
173-
assert_eq!(inner_field.metadata().get("is_json").map(String::as_str), Some("true"));
174+
assert_json_field_metadata(inner_field.metadata());
174175

175176
let array_field = batches[0]
176177
.column(0)
@@ -180,10 +181,15 @@ async fn test_json_get_array_inner_field_is_json_metadata() {
180181
let DataType::List(produced_inner) = array_field.data_type() else {
181182
panic!("expected List in produced array");
182183
};
184+
assert_json_field_metadata(produced_inner.metadata());
185+
}
186+
187+
fn assert_json_field_metadata(metadata: &HashMap<String, String>) {
183188
assert_eq!(
184-
produced_inner.metadata().get("is_json").map(String::as_str),
185-
Some("true")
189+
metadata.get("ARROW:extension:name").map(String::as_str),
190+
Some("arrow.json")
186191
);
192+
assert_eq!(metadata.get("ARROW:extension:metadata").map(String::as_str), Some("{}"));
187193
}
188194

189195
#[tokio::test]
@@ -437,12 +443,12 @@ async fn test_json_get_json_float() {
437443
}
438444

439445
#[tokio::test]
440-
async fn test_json_get_json_is_json_metadata() {
446+
async fn test_json_get_json_json_metadata() {
441447
let sql = r#"select json_get_json('{"x": [1, 2]}', 'x') as v"#;
442448
let batches = run_query(sql).await.unwrap();
443449
let schema = batches[0].schema();
444450
let field = schema.field(0);
445-
assert_eq!(field.metadata().get("is_json").map(String::as_str), Some("true"));
451+
assert_json_field_metadata(field.metadata());
446452
}
447453

448454
#[tokio::test]
@@ -631,10 +637,7 @@ fn test_json_get_utf8() {
631637
Arc::new(Field::new("arg_3", DataType::LargeUtf8, false)),
632638
],
633639
number_rows: 1,
634-
return_field: Arc::new(
635-
Field::new("ret_field", DataType::Utf8, false)
636-
.with_metadata(HashMap::from_iter(vec![("is_json".to_string(), "true".to_string())])),
637-
),
640+
return_field: Arc::new(Field::new("ret_field", DataType::Utf8, false).with_metadata(json_field_metadata())),
638641
config_options: Arc::new(ConfigOptions::default()),
639642
})
640643
.unwrap()
@@ -665,10 +668,7 @@ fn test_json_get_large_utf8() {
665668
Arc::new(Field::new("arg_3", DataType::LargeUtf8, false)),
666669
],
667670
number_rows: 1,
668-
return_field: Arc::new(
669-
Field::new("ret_field", DataType::Utf8, false)
670-
.with_metadata(HashMap::from_iter(vec![("is_json".to_string(), "true".to_string())])),
671-
),
671+
return_field: Arc::new(Field::new("ret_field", DataType::Utf8, false).with_metadata(json_field_metadata())),
672672
config_options: Arc::new(ConfigOptions::default()),
673673
})
674674
.unwrap()

0 commit comments

Comments
 (0)