uucode (Micro/µ Unicode)

A fast and flexible unicode library, fully configurable at build time.

Basic usage

const uucode = @import("uucode");

var cp: u21 = undefined; // A u21 is the type for a Unicode code point

//////////////////////
// `get` properties

cp = 0x2200; // ∀
uucode.get(.general_category, cp) // .symbol_math

cp = 0x03C2; // ς
uucode.get(.simple_uppercase_mapping, cp) // U+03A3 == Σ

cp = 0x21C1; // ⇁
uucode.get(.name, cp) // "RIGHTWARDS HARPOON WITH BARB DOWNWARDS"

// Many of the []const u21 fields need a single-item buffer passed to `with`:
var buffer: [1]u21 = undefined;
cp = 0x00DF; // ß
uucode.get(.uppercase_mapping, cp).with(&buffer, cp) // "SS"

//////////////////////
// `getAll` to get a group of properties for a code point together.

cp = 0x03C2; // ς

// The first argument is the name/index of the table.
const data = uucode.getAll("0", cp);

data.simple_uppercase_mapping // U+03A3 == Σ
data.general_category // .letter_lowercase

//////////////////////
// utf8.Iterator

var it = uucode.utf8.Iterator.init("😀😅😻👺");
it.next(); // 0x1F600
it.i; // 4 (bytes into the utf8 string)
it.peek(); // 0x1F605
it.next(); // 0x1F605
it.next(); // 0x1F63B
it.next(); // 0x1F47A

//////////////////////
// grapheme.Iterator / grapheme.utf8Iterator

var it = uucode.grapheme.utf8Iterator("👩🏽‍🚀🇨🇭👨🏻‍🍼");

// (which is equivalent to:)
var it = uucode.grapheme.Iterator(uucode.utf8.Iterator).init(.init("👩🏽‍🚀🇨🇭👨🏻‍🍼"));

// `nextCodePoint` advances one code point at a time, indicating a new grapheme
// with `is_break = true`.
it.nextCodePoint(); // { .code_point = 0x1F469; .is_break = false } // 👩
it.i; // 4 (bytes into the utf8 string)

it.peekCodePoint(); // { .code_point = 0x1F3FD; .is_break = false } // 🏽
it.nextCodePoint(); // { .code_point = 0x1F3FD; .is_break = false } // 🏽
it.nextCodePoint(); // { .code_point = 0x200D; .is_break = false } // Zero width joiner
it.nextCodePoint(); // { .code_point = 0x1F680; .is_break = true } // 🚀

// `nextGrapheme` advances until the start of the next grapheme cluster
const result = it.nextGrapheme(); // { .start = 15; .end = 23 }
it.i; // "👩🏽‍🚀🇨🇭".len
str[result.?.start..result.?.end]; // "🇨🇭"

const result = it.peekGrapheme();
str[result.?.start..result.?.end]; // "👨🏻‍🍼"

//////////////////////
// grapheme.isBreak

var break_state: uucode.grapheme.BreakState = .default;

var cp1: u21 = 0x1F469; // 👩
var cp2: u21 = 0x1F3FD; // 🏽
uucode.grapheme.isBreak(cp1, cp2, &break_state); // false

cp1 = cp2;
cp2 = 0x200D; // Zero width joiner
uucode.grapheme.isBreak(cp1, cp2, &break_state); // false

cp1 = cp2;
cp2 = 0x1F680; // 🚀
// The combined grapheme cluster is 👩🏽‍🚀 (woman astronaut)
uucode.grapheme.isBreak(cp1, cp2, &break_state); // false

cp1 = cp2;
cp2 = 0x1F468; // 👨
uucode.grapheme.isBreak(cp1, cp2, &break_state); // true

//////////////////////
// x.grapheme.wcwidth{,Next,Remaining} / x.grapheme.utf8Wcwidth

const str = "ò👨🏻‍❤️‍👨🏿_";
var it = uucode.grapheme.utf8Iterator(str);

// Requires the `wcwidth` builtin extension (see below)
uucode.x.grapheme.wcwidth(it); // 1 for 'ò'

uucode.x.grapheme.wcwidthNext(&it); // 1 for 'ò'
const result = it.peekGrapheme();
str[result.?.start..result.?.end]; // "👨🏻‍❤️‍👨🏿"

uucode.x.grapheme.wcwidthRemaining(&it); // 3 for "👨🏻‍❤️‍👨🏿_"

uucode.x.grapheme.utf8Wcwidth(str); // 4 for the whole string

//////////////////////
// TypeOf / TypeOfAll / hasField

uucode.TypeOf(.general_category)  // uucode.types.GeneralCategory
uucode.TypeOfAll("0")             // @TypeOf(uucode.getAll("0"))
uucode.hasField("is_emoji")       // true if `is_emoji` is in one of your tables

See src/fields.zig for the name and type of all fields.

Configuration

Only include the Unicode fields you actually use:

// In `build.zig`:
if (b.lazyDependency("uucode", .{
    .target = target,
    .optimize = optimize,
    .fields = @as([]const []const u8, &.{
        "name",
        "general_category",
        "case_folding_simple",
        "is_alphabetic",
        // ...
    }),
})) |dep| {
    step.root_module.addImport("uucode", dep.module("uucode"));
}

If you forget to add a field you're using, you'll get an error such as:

src/root.zig:36:29: error: enum 'get.FieldEnum__enum_8678' has no member named 'is_not_a_configured_field'
    try testing.expect(get(.is_not_a_configured_field, 65));
                           ~^~~~~~~~~~~~~~~~~~~~~~~~~

Multiple tables

Fields can be split into multiple tables using fields_0 through fields_9, to optimize how fields are stored and accessed (with no code changes needed).

// In `build.zig`:
if (b.lazyDependency("uucode", .{
    .target = target,
    .optimize = optimize,
    .fields_0 = @as([]const []const u8, &.{
        "general_category",
        "case_folding_simple",
        "is_alphabetic",
    }),
    .fields_1 = @as([]const []const u8, &.{
        // ...
    }),
    .fields_2 = @as([]const []const u8, &.{
        // ...
    }),
    // ... `fields_3` to `fields_9`
})) |dep| {
    step.root_module.addImport("uucode", dep.module("uucode"));
}

Advanced configuration

uucode is built on a powerful and flexible component system.

The build_config.zig (created for you with the basic configuration, or created by you in the advanced configuration), declares the following:

pub const fields: []const Field
pub const build_components: []const Component
pub const get_components: []const Component  // Not supported yet
pub const tables: []const Table

...and the Component and Table types both include a slice of field names fields: []const []const u8, where the component declares that it builds those fields, and the table declares that it stores the data for those fields.

See src/config.zig for the configuration types, src/generate.zig for how components are used to generate the tables.zig, and src/components.zig for all builtin components.

///////////////////////////////////////////////////////////
// In `build.zig`:

b.dependency("uucode", .{
    .target = target,
    .optimize = optimize,
    .build_config_path = b.path("src/build/uucode_config.zig"),

    // Alternatively, use a string literal:
    //.@"build_config.zig" = "..."
})

///////////////////////////////////////////////////////////
// In `src/build/uucode_config.zig`:

const std = @import("std");
const config = @import("config.zig");

// Define the fields you use in your components and tables (it's okay to have
// unused ones).
pub const fields = &config.mergeFields(config.fields, &.{
    .{ .name = "emoji_odd_or_even", .type = EmojiOddOrEven },

    // See `src/config.zig` for everything that can be overridden.
    // In this example, we're embedding 15 bytes into the `stage3` data,
    // and only names longer than that need to use the `backing` slice.
    config.field(config.fields, "name").override(.{
        .embedded_len = 15,
        .max_offset = 986096, // run once to get the correct number
    }),
});

// Define the components you use:
pub const build_components = &config.mergeComponents(config.build_components, &.{
    .{
        .Impl = EmojiOddOrEvenComponent,
        .inputs = &.{"is_emoji"},
        .fields = &.{"emoji_odd_or_even"},
    },
});

const EmojiOddOrEvenComponent = struct {
    pub fn build(
        comptime InputRow: type,
        comptime Row: type,
        allocator: std.mem.Allocator,
        inputs: config.MultiSlice(InputRow),
        rows: *config.MultiSlice(Row),
        backing: anytype,
        tracking: anytype,
    ) !void {
        // allocator is an ArenaAllocator, so don't worry about freeing
        _ = allocator;

        // backing is used for Slice types, but could be used for custom types
        // that need outside-of-table storage
        _ = backing;

        // tracking is used for some types that need to be configured ahead
        // of time, like the maximum and minimum value found in the data
        _ = tracking;

        // If the component only defines one field, it's generally better
        // to set `rows.len` and then just assign to positions in
        // `rows.items(.<field>)`. If setting multiple fields, it may be
        // better to use `rows.append` (see src/components.zig)
        rows.len = config.num_code_points;
        const items = rows.items(.emoji_odd_or_even);
        const input_items = inputs.items(.is_emoji);

        for (0..config.num_code_points) |i| {
            const cp: u21 = @intCast(i);
            items[i] = if (!input_items[i])
                EmojiOddOrEven.not_emoji
            else if (cp % 2 == 0)
                EmojiOddOrEven.even_emoji
            else
                EmojiOddOrEven.odd_emoji;
        }
    }
};

const EmojiOddOrEven = enum(u2) {
    not_emoji,
    even_emoji,
    odd_emoji,
};

// Configure tables with the `tables` declaration.
// The only required field is `fields`, and the rest have reasonable defaults.
pub const tables: []const config.Table = &.{
    .{
        // Optional name, to be able to `getAll("foo")` rather than e.g.
        // `getAll("0")`
        .name = "foo",

        // A two stage table can be slightly faster if the data is small. The
        // default `.auto` will pick a reasonable value, but to get the
        // absolute best performance run benchmarks with `.two` or `.three`
        // on realistic data.
        .stages = .three,

        // The default `.auto` value decides whether the final data stage struct
        // should be a `packed struct` (.@"packed") or a regular Zig `struct`.
        .packing = .unpacked,

        .fields = &.{
            "emoji_odd_or_even",
            "general_category",
            "block",
            "wcwidth_standalone",
            "wcwidth_zero_in_grapheme",
            // ...
        },
    },
};

// Turn on debug logging:
pub const log_level = .debug;

///////////////////////////////////////////////////////////
// In your code:

const uucode = @import("uucode");

uucode.get(.emoji_odd_or_even, 0x1F34B) // 🍋 == .odd_emoji

uucode.get(.wcwidth_standalone, 0x26F5) // ⛵ == 2

For more examples of advanced configuration and extensions, see src/test/build_config.zig.

Special types

uucode has a number of special types to handle certain unicode fields.

These fields are configured in src/config.zig with the Field struct:

pub const Field = struct {
    name: [:0]const u8,
    type: type,

    // For Shift + Slice fields
    cp_packing: CpPacking = .direct,
    shift_low: isize = 0,
    shift_high: isize = 0,

    // For Slice fields
    max_len: usize = 0,
    max_offset: usize = 0,
    embedded_len: usize = 0,

    // For PackedOptional fields
    min_value: isize = 0,
    max_value: isize = 0,

    ...
};

The special types are as follows.

PackedOptional

This allows for packing an optional field, by fitting the "null" as the maxInt of the IntFittingRange between the configured min_value and max_value + 1.

Shift

This allows for including u21 fields without bloating the data required to store the field. A number of unicode fields default to mapping to themselves (e.g. simple_uppercase_mapping) or some other code point that might not be too far away in value. The Shift type internally stores the difference between the current code point and the u21 value, and at the time of get, adds this difference back to get the actual value.

To use a Shift field, the following need to be set:

.cp_packing = .shift
.shift_low
.shift_high

Temporarily configure shift_low to -@as(isize, config.max_code_point) and shift_high to config.max_code_point, then run the build to get the actual values.

error: Config for field 'uppercase_mapping_first_char' does not match actual. Set .shift_low = -64190, // change from -1114111
error: Config for field 'uppercase_mapping_first_char' does not match actual. Set .shift_high = 42561, // change from 1114111
thread 38963456 panic: Table config doesn't match actual. See above for details

Union

This allows for including union fields in packed tables, or with shift fields. No special configuration is needed unless there is a shift field, in which case shift_low and shift_high need to be set as described for Shift.

Slice

This allows for including slice fields ([]const T). Unsupported for packed tables. This can also include a shift for when the slice is u21, used only when the length is 1.

A Slice field stores its data as a union of three possible representations: embedded in a fixed-size array (embedded: [embedded_len]T), in the shift field (for a u21 slice of length 1), or as an offset (offset: Offset) pointing into a "backing" buffer.

To use a Slice field, the following need to be set:

.max_len: The maximum length of the slice across all code points.
.max_offset: The maximum offset in the offset field (technically, the length of the backing buffer after iterating over all code points).
.embedded_len: The length of the embedded array, defaulting to zero.
.shift_low: Needs to be configured as in Shift above for u21 element types.
.shift_high: Needs to be configured as in Shift above for u21 element types.

When using embedded_len > 0, the slice will be stored in the embedded array if it fits, otherwise in the backing buffer.

History and acknowledgments

uucode grew out of work on the Ghostty terminal, on an issue to upgrade dependencies, where the experience of modifying zg gave the confidence to build a fresh new library.

uucode builds upon the Unicode performance work done in Ghostty, as outlined in this excellent Devlog. The 3-stage lookup tables, as mentioned in that Devlog, come from this article.

License

uucode is available under an MIT License. See ./LICENSE.md for the license text and an index of licenses for code used in the repo.

Resources

See ./RESOURCES.md for a list of resources used to build uucode.

Name		Name	Last commit message	Last commit date
Latest commit History 386 Commits
bin		bin
licenses		licenses
resources		resources
src		src
ucd		ucd
.gitignore		.gitignore
AGENTS.md		AGENTS.md
LICENSE.md		LICENSE.md
README.md		README.md
RESOURCES.md		RESOURCES.md
build.zig		build.zig
build.zig.zon		build.zig.zon
mise.toml		mise.toml

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

uucode (Micro/µ Unicode)

Basic usage

Configuration

Multiple tables

Advanced configuration

Special types

PackedOptional

Shift

Union

Slice

History and acknowledgments

License

Resources

About

Uh oh!

Releases

Packages

Uh oh!

Contributors

Uh oh!

Languages

Folders and files

Latest commit

History

Repository files navigation

uucode (Micro/µ Unicode)

Basic usage

Configuration

Multiple tables

Advanced configuration

Special types

PackedOptional

Shift

Union

Slice

History and acknowledgments

License

Resources

About

Resources

License

Uh oh!

Stars

Watchers

Forks

Releases

Packages 0

Uh oh!

Contributors

Uh oh!

Languages

Packages