From 80a8257436d93597ab0ef4779b54de66965f6c33 Mon Sep 17 00:00:00 2001
From: Ilia Alshanetsky <ilia@ilia.ws>
Date: Sun, 10 May 2026 18:47:53 -0400
Subject: [PATCH] Add YYJSON_WRITE_LOWERCASE_HEX flag for \uXXXX escape case
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RFC 8259 §7 allows both uppercase and lowercase hex digits in \uXXXX
escapes, but the rest of the JSON ecosystem (PHP ext/json, Python json,
Node.js JSON.stringify, Go encoding/json, Ruby json, Jackson,
RapidJSON) defaults to lowercase hex. yyjson is the conspicuous
outlier with uppercase. Callers shipping yyjson output to those
consumers currently need a post-processing pass for byte-equality.

This flag selects a second pre-computed 512-byte hex table at the
top of each writer call; the per-byte hot loop is identical. No
performance impact on either path.

Implementation:
- Add esc_hex_char_table_lower next to the existing uppercase table
- get_hex_table_with_flag() returns the right table based on flag
- write_str() takes hex_table as parameter; threaded through the 5
  write driver functions that call it
- The U+FFFD replacement in the invalid-UTF-8 error path now reads
  its 4 hex bytes from hex_table too (two byte_copy_2 instead of one
  byte_copy_4 from a hardcoded rep v32; same total output, error
  path is not perf-critical)
---
 src/yyjson.c | 130 ++++++++++++++++++++++++++++++++++++++++++++-------
 src/yyjson.h |  12 +++++
 2 files changed, 124 insertions(+), 18 deletions(-)

diff --git a/src/yyjson.c b/src/yyjson.c
index c16d925..783bf4d 100644
--- a/src/yyjson.c
+++ b/src/yyjson.c
@@ -8506,6 +8506,78 @@ static const u8 esc_hex_char_table[512] = {
     'F', 'C', 'F', 'D', 'F', 'E', 'F', 'F'
 };
 
+/** Lowercase variant of esc_hex_char_table, selected at write time by
+    YYJSON_WRITE_LOWERCASE_HEX. Entry i (bytes 2*i and 2*i+1) holds the
+    two hex digits of byte value i, e.g. 0xAB -> 'a','b'. Matches ext/json
+    (PHP), Python json, Node.js JSON.stringify, Go encoding/json, Ruby. */
+yyjson_align(2)
+static const u8 esc_hex_char_table_lower[512] = {
+    '0', '0', '0', '1', '0', '2', '0', '3',
+    '0', '4', '0', '5', '0', '6', '0', '7',
+    '0', '8', '0', '9', '0', 'a', '0', 'b',
+    '0', 'c', '0', 'd', '0', 'e', '0', 'f',
+    '1', '0', '1', '1', '1', '2', '1', '3',
+    '1', '4', '1', '5', '1', '6', '1', '7',
+    '1', '8', '1', '9', '1', 'a', '1', 'b',
+    '1', 'c', '1', 'd', '1', 'e', '1', 'f',
+    '2', '0', '2', '1', '2', '2', '2', '3',
+    '2', '4', '2', '5', '2', '6', '2', '7',
+    '2', '8', '2', '9', '2', 'a', '2', 'b',
+    '2', 'c', '2', 'd', '2', 'e', '2', 'f',
+    '3', '0', '3', '1', '3', '2', '3', '3',
+    '3', '4', '3', '5', '3', '6', '3', '7',
+    '3', '8', '3', '9', '3', 'a', '3', 'b',
+    '3', 'c', '3', 'd', '3', 'e', '3', 'f',
+    '4', '0', '4', '1', '4', '2', '4', '3',
+    '4', '4', '4', '5', '4', '6', '4', '7',
+    '4', '8', '4', '9', '4', 'a', '4', 'b',
+    '4', 'c', '4', 'd', '4', 'e', '4', 'f',
+    '5', '0', '5', '1', '5', '2', '5', '3',
+    '5', '4', '5', '5', '5', '6', '5', '7',
+    '5', '8', '5', '9', '5', 'a', '5', 'b',
+    '5', 'c', '5', 'd', '5', 'e', '5', 'f',
+    '6', '0', '6', '1', '6', '2', '6', '3',
+    '6', '4', '6', '5', '6', '6', '6', '7',
+    '6', '8', '6', '9', '6', 'a', '6', 'b',
+    '6', 'c', '6', 'd', '6', 'e', '6', 'f',
+    '7', '0', '7', '1', '7', '2', '7', '3',
+    '7', '4', '7', '5', '7', '6', '7', '7',
+    '7', '8', '7', '9', '7', 'a', '7', 'b',
+    '7', 'c', '7', 'd', '7', 'e', '7', 'f',
+    '8', '0', '8', '1', '8', '2', '8', '3',
+    '8', '4', '8', '5', '8', '6', '8', '7',
+    '8', '8', '8', '9', '8', 'a', '8', 'b',
+    '8', 'c', '8', 'd', '8', 'e', '8', 'f',
+    '9', '0', '9', '1', '9', '2', '9', '3',
+    '9', '4', '9', '5', '9', '6', '9', '7',
+    '9', '8', '9', '9', '9', 'a', '9', 'b',
+    '9', 'c', '9', 'd', '9', 'e', '9', 'f',
+    'a', '0', 'a', '1', 'a', '2', 'a', '3',
+    'a', '4', 'a', '5', 'a', '6', 'a', '7',
+    'a', '8', 'a', '9', 'a', 'a', 'a', 'b',
+    'a', 'c', 'a', 'd', 'a', 'e', 'a', 'f',
+    'b', '0', 'b', '1', 'b', '2', 'b', '3',
+    'b', '4', 'b', '5', 'b', '6', 'b', '7',
+    'b', '8', 'b', '9', 'b', 'a', 'b', 'b',
+    'b', 'c', 'b', 'd', 'b', 'e', 'b', 'f',
+    'c', '0', 'c', '1', 'c', '2', 'c', '3',
+    'c', '4', 'c', '5', 'c', '6', 'c', '7',
+    'c', '8', 'c', '9', 'c', 'a', 'c', 'b',
+    'c', 'c', 'c', 'd', 'c', 'e', 'c', 'f',
+    'd', '0', 'd', '1', 'd', '2', 'd', '3',
+    'd', '4', 'd', '5', 'd', '6', 'd', '7',
+    'd', '8', 'd', '9', 'd', 'a', 'd', 'b',
+    'd', 'c', 'd', 'd', 'd', 'e', 'd', 'f',
+    'e', '0', 'e', '1', 'e', '2', 'e', '3',
+    'e', '4', 'e', '5', 'e', '6', 'e', '7',
+    'e', '8', 'e', '9', 'e', 'a', 'e', 'b',
+    'e', 'c', 'e', 'd', 'e', 'e', 'e', 'f',
+    'f', '0', 'f', '1', 'f', '2', 'f', '3',
+    'f', '4', 'f', '5', 'f', '6', 'f', '7',
+    'f', '8', 'f', '9', 'f', 'a', 'f', 'b',
+    'f', 'c', 'f', 'd', 'f', 'e', 'f', 'f'
+};
+
 /** Escaped single character table. (generate with misc/make_tables.c) */
 yyjson_align(2)
 static const u8 esc_single_char_table[512] = {
@@ -8575,6 +8647,16 @@ static const u8 esc_single_char_table[512] = {
     ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '
 };
 
+/** Selects the two-digit hex lookup table used when writing \uXXXX
+    escapes: the lowercase table ('a'..'f') when the LOWERCASE_HEX write
+    flag is set in `flg`, otherwise the default uppercase table
+    ('A'..'F'). */
+static_inline const u8 *get_hex_table_with_flag(yyjson_write_flag flg) {
+    const u8 *table = esc_hex_char_table;
+    if (has_flg(LOWERCASE_HEX)) table = esc_hex_char_table_lower;
+    return table;
+}
+
 /** Returns the encode table with options. */
 static_inline const char_enc_type *get_enc_table_with_flag(
     yyjson_write_flag flg) {
@@ -8640,9 +8722,11 @@ static_inline u8 *write_str_noesc(u8 *cur, const u8 *str, usize str_len) {
  */
 static_inline u8 *write_str(u8 *cur, bool esc, bool inv,
                             const u8 *str, usize str_len,
-                            const char_enc_type *enc_table) {
-    /* The replacement character U+FFFD, used to indicate invalid character. */
-    const v32 rep = {{ 'F', 'F', 'F', 'D' }};
+                            const char_enc_type *enc_table,
+                            const u8 *hex_table) {
+    /* Escape prefix: all 4 bytes (backslash, 'u', '0', '0') open a 1-byte
+       escape; the first 2 alone open every wider one. The digits of the
+       U+FFFD replacement come from hex_table, not from a constant. */
     const v32 pre = {{ '\\', 'u', '0', '0' }};
 
     const u8 *src = str;
@@ -8759,7 +8843,7 @@ static_inline u8 *write_str(u8 *cur, bool esc, bool inv,
         }
         case CHAR_ENC_ESC_1: {
             byte_copy_4(cur + 0, &pre);
-            byte_copy_2(cur + 4, &esc_hex_char_table[*src * 2]);
+            byte_copy_2(cur + 4, &hex_table[*src * 2]);
             cur += 6;
             src += 1;
             goto copy_utf8;
@@ -8775,8 +8859,8 @@ static_inline u8 *write_str(u8 *cur, bool esc, bool inv,
             u = (u16)(((u16)(src[0] & 0x1F) << 6) |
                       ((u16)(src[1] & 0x3F) << 0));
             byte_copy_2(cur + 0, &pre);
-            byte_copy_2(cur + 2, &esc_hex_char_table[(u >> 8) * 2]);
-            byte_copy_2(cur + 4, &esc_hex_char_table[(u & 0xFF) * 2]);
+            byte_copy_2(cur + 2, &hex_table[(u >> 8) * 2]);
+            byte_copy_2(cur + 4, &hex_table[(u & 0xFF) * 2]);
             cur += 6;
             src += 2;
             goto copy_utf8;
@@ -8792,8 +8876,8 @@ static_inline u8 *write_str(u8 *cur, bool esc, bool inv,
                       ((u16)(src[1] & 0x3F) << 6) |
                       ((u16)(src[2] & 0x3F) << 0));
             byte_copy_2(cur + 0, &pre);
-            byte_copy_2(cur + 2, &esc_hex_char_table[(u >> 8) * 2]);
-            byte_copy_2(cur + 4, &esc_hex_char_table[(u & 0xFF) * 2]);
+            byte_copy_2(cur + 2, &hex_table[(u >> 8) * 2]);
+            byte_copy_2(cur + 4, &hex_table[(u & 0xFF) * 2]);
             cur += 6;
             src += 3;
             goto copy_utf8;
@@ -8812,11 +8896,11 @@ static_inline u8 *write_str(u8 *cur, bool esc, bool inv,
             hi = (u >> 10) + 0xD800;
             lo = (u & 0x3FF) + 0xDC00;
             byte_copy_2(cur + 0, &pre);
-            byte_copy_2(cur + 2, &esc_hex_char_table[(hi >> 8) * 2]);
-            byte_copy_2(cur + 4, &esc_hex_char_table[(hi & 0xFF) * 2]);
+            byte_copy_2(cur + 2, &hex_table[(hi >> 8) * 2]);
+            byte_copy_2(cur + 4, &hex_table[(hi & 0xFF) * 2]);
             byte_copy_2(cur + 6, &pre);
-            byte_copy_2(cur + 8, &esc_hex_char_table[(lo >> 8) * 2]);
-            byte_copy_2(cur + 10, &esc_hex_char_table[(lo & 0xFF) * 2]);
+            byte_copy_2(cur + 8, &hex_table[(lo >> 8) * 2]);
+            byte_copy_2(cur + 10, &hex_table[(lo & 0xFF) * 2]);
             cur += 12;
             src += 4;
             goto copy_utf8;
@@ -8843,7 +8927,12 @@ static_inline u8 *write_str(u8 *cur, bool esc, bool inv,
 err_esc:
     if (!inv) return NULL;
     byte_copy_2(cur + 0, &pre);
-    byte_copy_4(cur + 2, &rep);
+    /* Invalid input with `inv` set: skip one source byte and emit the
+       replacement character U+FFFD as an escape. Its four hex digits
+       are copied as two pairs (0xFF, 0xFD) from hex_table, so
+       LOWERCASE_HEX produces "fffd" and the default produces "FFFD". */
+    byte_copy_2(cur + 2, &hex_table[0xFF * 2]);
+    byte_copy_2(cur + 4, &hex_table[0xFD * 2]);
     cur += 6;
     src += 1;
     goto copy_utf8;
@@ -8972,6 +9061,7 @@ static_inline u8 *yyjson_write_single(yyjson_val *val,
     usize str_len;
     const u8 *str_ptr;
     const char_enc_type *enc_table = get_enc_table_with_flag(flg);
+    const u8 *hex_table = get_hex_table_with_flag(flg);
     bool cpy = (enc_table == enc_table_cpy);
     bool esc = has_flg(ESCAPE_UNICODE) != 0;
     bool inv = has_allow(INVALID_UNICODE) != 0;
@@ -8995,7 +9085,7 @@ static_inline u8 *yyjson_write_single(yyjson_val *val,
             if (likely(cpy) && unsafe_yyjson_get_subtype(val)) {
                 cur = write_str_noesc(cur, str_ptr, str_len);
             } else {
-                cur = write_str(cur, esc, inv, str_ptr, str_len, enc_table);
+                cur = write_str(cur, esc, inv, str_ptr, str_len, enc_table, hex_table);
                 if (unlikely(!cur)) goto fail_str;
             }
             break;
@@ -9099,6 +9189,7 @@ static_inline u8 *yyjson_write_minify(const yyjson_val *root,
     usize alc_len, alc_inc, ctx_len, ext_len, str_len;
     const u8 *str_ptr;
     const char_enc_type *enc_table = get_enc_table_with_flag(flg);
+    const u8 *hex_table = get_hex_table_with_flag(flg);
     bool cpy = (enc_table == enc_table_cpy);
     bool esc = has_flg(ESCAPE_UNICODE) != 0;
     bool inv = has_allow(INVALID_UNICODE) != 0;
@@ -9132,7 +9223,7 @@ static_inline u8 *yyjson_write_minify(const yyjson_val *root,
         if (likely(cpy) && unsafe_yyjson_get_subtype(val)) {
             cur = write_str_noesc(cur, str_ptr, str_len);
         } else {
-            cur = write_str(cur, esc, inv, str_ptr, str_len, enc_table);
+            cur = write_str(cur, esc, inv, str_ptr, str_len, enc_table, hex_table);
             if (unlikely(!cur)) goto fail_str;
         }
         *cur++ = is_key ? ':' : ',';
@@ -9280,6 +9371,7 @@ static_inline u8 *yyjson_write_pretty(const yyjson_val *root,
     usize alc_len, alc_inc, ctx_len, ext_len, str_len, level;
     const u8 *str_ptr;
     const char_enc_type *enc_table = get_enc_table_with_flag(flg);
+    const u8 *hex_table = get_hex_table_with_flag(flg);
     bool cpy = (enc_table == enc_table_cpy);
     bool esc = has_flg(ESCAPE_UNICODE) != 0;
     bool inv = has_allow(INVALID_UNICODE) != 0;
@@ -9318,7 +9410,7 @@ static_inline u8 *yyjson_write_pretty(const yyjson_val *root,
         if (likely(cpy) && unsafe_yyjson_get_subtype(val)) {
             cur = write_str_noesc(cur, str_ptr, str_len);
         } else {
-            cur = write_str(cur, esc, inv, str_ptr, str_len, enc_table);
+            cur = write_str(cur, esc, inv, str_ptr, str_len, enc_table, hex_table);
             if (unlikely(!cur)) goto fail_str;
         }
         *cur++ = is_key ? ':' : ',';
@@ -9653,6 +9745,7 @@ static_inline u8 *yyjson_mut_write_minify(const yyjson_mut_val *root,
     usize alc_len, alc_inc, ctx_len, ext_len, str_len;
     const u8 *str_ptr;
     const char_enc_type *enc_table = get_enc_table_with_flag(flg);
+    const u8 *hex_table = get_hex_table_with_flag(flg);
     bool cpy = (enc_table == enc_table_cpy);
     bool esc = has_flg(ESCAPE_UNICODE) != 0;
     bool inv = has_allow(INVALID_UNICODE) != 0;
@@ -9687,7 +9780,7 @@ static_inline u8 *yyjson_mut_write_minify(const yyjson_mut_val *root,
         if (likely(cpy) && unsafe_yyjson_get_subtype(val)) {
             cur = write_str_noesc(cur, str_ptr, str_len);
         } else {
-            cur = write_str(cur, esc, inv, str_ptr, str_len, enc_table);
+            cur = write_str(cur, esc, inv, str_ptr, str_len, enc_table, hex_table);
             if (unlikely(!cur)) goto fail_str;
         }
         *cur++ = is_key ? ':' : ',';
@@ -9840,6 +9933,7 @@ static_inline u8 *yyjson_mut_write_pretty(const yyjson_mut_val *root,
     usize alc_len, alc_inc, ctx_len, ext_len, str_len, level;
     const u8 *str_ptr;
     const char_enc_type *enc_table = get_enc_table_with_flag(flg);
+    const u8 *hex_table = get_hex_table_with_flag(flg);
     bool cpy = (enc_table == enc_table_cpy);
     bool esc = has_flg(ESCAPE_UNICODE) != 0;
     bool inv = has_allow(INVALID_UNICODE) != 0;
@@ -9879,7 +9973,7 @@ static_inline u8 *yyjson_mut_write_pretty(const yyjson_mut_val *root,
         if (likely(cpy) && unsafe_yyjson_get_subtype(val)) {
             cur = write_str_noesc(cur, str_ptr, str_len);
         } else {
-            cur = write_str(cur, esc, inv, str_ptr, str_len, enc_table);
+            cur = write_str(cur, esc, inv, str_ptr, str_len, enc_table, hex_table);
             if (unlikely(!cur)) goto fail_str;
         }
         *cur++ = is_key ? ':' : ',';
diff --git a/src/yyjson.h b/src/yyjson.h
index 5eb6d46..b5391e5 100644
--- a/src/yyjson.h
+++ b/src/yyjson.h
@@ -1192,6 +1192,18 @@ static const yyjson_write_flag YYJSON_WRITE_PRETTY_TWO_SPACES       = 1 << 6;
     This can be helpful for text editors or NDJSON. */
 static const yyjson_write_flag YYJSON_WRITE_NEWLINE_AT_END          = 1 << 7;
 
+/** Use lowercase hex digits in `\uXXXX` escape sequences (e.g. `\u00c4`)
+    instead of the default uppercase (`\u00C4`). Both forms are
+    equivalent per RFC 8259 section 7, but many other JSON encoders
+    (PHP ext/json, Python `json`, Node.js `JSON.stringify`, Go
+    `encoding/json`, Ruby `json`) default to lowercase. This flag lets
+    callers shipping output to those consumers achieve byte-equality
+    without a post-processing pass.
+    The writer selects between two pre-computed 512-byte hex tables
+    once at the top of each call; the per-byte hot loop is the same
+    for both settings. */
+static const yyjson_write_flag YYJSON_WRITE_LOWERCASE_HEX           = 1 << 8;
+
 
 
 /** The highest 8 bits of `yyjson_write_flag` and real number value's `tag`