Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions components/normalizer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ icu_properties = ["dep:icu_properties"]
utf16_iter = ["dep:utf16_iter", "dep:write16"]
# For dealing with potentially ill-formed UTF8 strings
utf8_iter = ["dep:utf8_iter"]
# For dealing with Latin1 strings
latin1 = ["dep:write16"]
harfbuzz_traits = ["dep:harfbuzz-traits"]

# added by accident
Expand Down
280 changes: 280 additions & 0 deletions components/normalizer/src/latin1.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! Methods for normalizing Latin1 input into a UTF-16 sink.
//!
//! NFC is not available, since Latin1 input is already known to be
//! in NFC.

use write16::Write16;

/// Normalization data for U+00A0..=U+00FF (96 entries); entry 0
/// corresponds to U+00A0 NO-BREAK SPACE. If the character is
/// always its own normalization, the value in the table is 0.
/// If the character has a compatibility decomposition, the value
/// in the table is the index into `COMPATIBILITY_DECOMPOSITIONS`
/// shifted left by two and the length of the subslice of
/// `COMPATIBILITY_DECOMPOSITIONS` in the low 2 bits. This means
/// that the high half is zero. Otherwise, the high 8 bits are the
/// first character of the canonical decomposition and the low 8
/// bits are the offset that needs to be added to U+0300 to get the
/// second (combining) character of the canonical decomposition.
static TABLE: [u16; 96] = [
    0x01, // nbsp
    0, // ¡
    0, // ¢
    0, // £
    0, // ¤
    0, // ¥
    0, // ¦
    0, // §
    0x02, // ¨
    0, // ©
    0x09, // ª
    0, // «
    0, // ¬
    0, // shy
    0, // ®
    0x0E, // ¯
    0, // °
    0, // ±
    0x41, // ²
    0x45, // ³
    0x16, // ´
    0x1D, // µ
    0, // ¶
    0, // ·
    0x22, // ¸
    0x2D, // ¹
    0x29, // º
    0, // »
    0x2F, // ¼
    0x3B, // ½
    0x47, // ¾
    0, // ¿
    0x4100, // À
    0x4101, // Á
    0x4102, // Â
    0x4103, // Ã
    0x4108, // Ä
    0x410A, // Å
    0, // Æ
    0x4327, // Ç
    0x4500, // È
    0x4501, // É
    0x4502, // Ê
    0x4508, // Ë
    0x4900, // Ì
    0x4901, // Í
    0x4902, // Î
    0x4908, // Ï
    0, // Ð
    0x4E03, // Ñ
    0x4F00, // Ò
    0x4F01, // Ó
    0x4F02, // Ô
    0x4F03, // Õ
    0x4F08, // Ö
    0, // ×
    0, // Ø
    0x5500, // Ù
    0x5501, // Ú
    0x5502, // Û
    0x5508, // Ü
    0x5901, // Ý
    0, // Þ
    0, // ß
    0x6100, // à
    0x6101, // á
    0x6102, // â
    0x6103, // ã
    0x6108, // ä
    0x610A, // å
    0, // æ
    0x6327, // ç
    0x6500, // è
    0x6501, // é
    0x6502, // ê
    0x6508, // ë
    0x6900, // ì
    0x6901, // í
    0x6902, // î
    0x6908, // ï
    0, // ð
    0x6E03, // ñ
    0x6F00, // ò
    0x6F01, // ó
    0x6F02, // ô
    0x6F03, // õ
    0x6F08, // ö
    0, // ÷
    0, // ø
    0x7500, // ù
    0x7501, // ú
    0x7502, // û
    0x7508, // ü
    0x7901, // ý
    0, // þ
    0x7908, // ÿ
];

/// Table containing the concatenated compatibility decompositions
/// referenced by the compatibility entries of `TABLE`.
static COMPATIBILITY_DECOMPOSITIONS: [u16; 20] = [
    0x0020, 0x0308, 0x0061, 0x0020, 0x0304, 0x0020, 0x0301, 0x03BC, 0x0020, 0x0327, 0x006F, 0x0031,
    0x2044, 0x0034, 0x0031, 0x2044, 0x0032, 0x0033, 0x2044, 0x0034,
];

/// Returns the compatibility decomposition encoded by the `TABLE` value
/// `val`: the bits above the low two are the start index into
/// `COMPATIBILITY_DECOMPOSITIONS` and the low 2 bits are the length of
/// the subslice.
///
/// `val` must be a compatibility entry of `TABLE`, i.e. its high byte
/// is zero; out-of-range values return an empty slice (debug-asserted,
/// never supposed to happen).
fn compatibility_decomposition(val: u16) -> &'static [u16] {
    debug_assert!(val <= 0xFF);
    let start = usize::from(val >> 2);
    let end = start + usize::from(val & 0b11);
    COMPATIBILITY_DECOMPOSITIONS.get(start..end).unwrap_or_else(|| {
        // Internal bug, not even GIGO, never supposed to happen
        debug_assert!(false);
        &[]
    })
}

/// Normalize Latin1 `text` to NFD UTF-16 written to `sink`.
pub fn normalize_nfd_to<W: Write16 + ?Sized>(text: &[u8], sink: &mut W) -> core::fmt::Result {
    for &byte in text {
        // Only U+00C0..=U+00FF can decompose canonically; everything below,
        // including the compatibility-only U+00A0..=U+00BF range, is already
        // its own NFD normalization.
        let entry = byte
            .checked_sub(0xC0)
            .and_then(|i| TABLE.get(0x20 + usize::from(i)))
            .copied()
            .unwrap_or(0);
        if entry == 0 {
            sink.write_slice(&[u16::from(byte)])?;
        } else {
            // High byte: the base character; low byte: the offset of the
            // combining mark from U+0300.
            sink.write_slice(&[entry >> 8, (entry & 0xFF) + 0x0300])?;
        }
    }
    Ok(())
}

/// Normalize Latin1 `text` to NFKD UTF-16 written to `sink`.
pub fn normalize_nfkd_to<W: Write16 + ?Sized>(text: &[u8], sink: &mut W) -> core::fmt::Result {
    for &byte in text {
        // The whole U+00A0..=U+00FF range may decompose under NFKD.
        let entry = byte
            .checked_sub(0xA0)
            .and_then(|i| TABLE.get(usize::from(i)))
            .copied()
            .unwrap_or(0);
        match entry {
            // The byte is its own NFKD normalization.
            0 => sink.write_slice(&[u16::from(byte)])?,
            // Non-zero high byte: canonical base character plus the offset of
            // the combining mark from U+0300.
            v if v > 0xFF => sink.write_slice(&[v >> 8, (v & 0xFF) + 0x0300])?,
            // Zero high byte: packed reference into the compatibility table.
            v => sink.write_slice(compatibility_decomposition(v))?,
        }
    }
    Ok(())
}

/// Normalize Latin1 `text` to NFKC UTF-16 written to `sink`.
pub fn normalize_nfkc_to<W: Write16 + ?Sized>(text: &[u8], sink: &mut W) -> core::fmt::Result {
    for &byte in text {
        // Only U+00A0..=U+00BF (the first 0x20 table entries) is relevant
        // here: the canonical decompositions of U+00C0..=U+00FF recompose
        // under NFKC, so those bytes pass through unchanged.
        let entry = if (0xA0..=0xBF).contains(&byte) {
            TABLE.get(usize::from(byte - 0xA0)).copied().unwrap_or(0)
        } else {
            0
        };
        if entry == 0 {
            sink.write_slice(&[u16::from(byte)])?;
        } else {
            sink.write_slice(compatibility_decomposition(entry))?;
        }
    }
    Ok(())
}

/// Split Latin1 `text` into `(head, tail)` such that the first
/// byte of `tail` is the first byte of input that is not in NFD.
/// If `text` is fully in NFD, `tail` is empty.
pub fn split_normalized_nfd(text: &[u8]) -> (&[u8], &[u8]) {
    // A byte is not in NFD iff it is in U+00C0..=U+00FF and has a non-zero
    // (canonical-decomposition) table entry.
    let boundary = text
        .iter()
        .position(|&byte| {
            byte.checked_sub(0xC0)
                .and_then(|i| TABLE.get(0x20 + usize::from(i)))
                .map_or(false, |&entry| entry != 0)
        })
        .unwrap_or(text.len());
    text.split_at_checked(boundary).unwrap_or_else(|| {
        // Internal bug, not even GIGO, never supposed to happen:
        // `boundary` is at most `text.len()`.
        debug_assert!(false);
        (&[], text)
    })
}

/// Split Latin1 `text` into `(head, tail)` such that the first
/// byte of `tail` is the first byte of input that is not in NFKD.
/// If `text` is fully in NFKD, `tail` is empty.
pub fn split_normalized_nfkd(text: &[u8]) -> (&[u8], &[u8]) {
    // A byte is not in NFKD iff it is in U+00A0..=U+00FF and has a non-zero
    // (canonical or compatibility) table entry.
    let boundary = text
        .iter()
        .position(|&byte| {
            byte.checked_sub(0xA0)
                .and_then(|i| TABLE.get(usize::from(i)))
                .map_or(false, |&entry| entry != 0)
        })
        .unwrap_or(text.len());
    text.split_at_checked(boundary).unwrap_or_else(|| {
        // Internal bug, not even GIGO, never supposed to happen:
        // `boundary` is at most `text.len()`.
        debug_assert!(false);
        (&[], text)
    })
}

/// Split Latin1 `text` into `(head, tail)` such that the first
/// byte of `tail` is the first byte of input that is not in NFKC.
/// If `text` is fully in NFKC, `tail` is empty.
pub fn split_normalized_nfkc(text: &[u8]) -> (&[u8], &[u8]) {
    // A byte is not in NFKC iff it is in U+00A0..=U+00BF with a non-zero
    // (compatibility) table entry; U+00C0..=U+00FF is already composed.
    let boundary = text
        .iter()
        .position(|&byte| {
            (0xA0..=0xBF).contains(&byte)
                && TABLE
                    .get(usize::from(byte - 0xA0))
                    .map_or(false, |&entry| entry != 0)
        })
        .unwrap_or(text.len());
    text.split_at_checked(boundary).unwrap_or_else(|| {
        // Internal bug, not even GIGO, never supposed to happen:
        // `boundary` is at most `text.len()`.
        debug_assert!(false);
        (&[], text)
    })
}
2 changes: 2 additions & 0 deletions components/normalizer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ macro_rules! ccc {

#[cfg(feature = "harfbuzz_traits")]
mod harfbuzz;
#[cfg(feature = "latin1")]
pub mod latin1;
pub mod properties;
pub mod provider;
pub mod uts46;
Expand Down
81 changes: 81 additions & 0 deletions components/normalizer/tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2081,3 +2081,84 @@ fn test_is_normalized_up_to() {
0
);
}

#[test]
fn test_latin1_split_normalized_nfd() {
    // 0xA8 (¨) has only a compatibility decomposition, so it is in NFD;
    // 0xE4 (ä) decomposes canonically, so the split lands just before it.
    let (head, tail) = icu_normalizer::latin1::split_normalized_nfd(b"abc\xA8\xE4efg");
    assert_eq!((head, tail), (&b"abc\xA8"[..], &b"\xE4efg"[..]));
}

#[test]
fn test_latin1_split_normalized_nfkd() {
    // 0xA8 (¨) has a compatibility decomposition, so under NFKD the split
    // already lands before it.
    let (head, tail) = icu_normalizer::latin1::split_normalized_nfkd(b"abc\xA8\xE4efg");
    assert_eq!((head, tail), (&b"abc"[..], &b"\xA8\xE4efg"[..]));
}

#[test]
fn test_latin1_split_normalized_nfkc() {
    // 0xE4 (ä) is composed and thus in NFKC; 0xA8 (¨) has a compatibility
    // decomposition, so the split lands just before it.
    let (head, tail) = icu_normalizer::latin1::split_normalized_nfkc(b"abc\xE4\xA8efg");
    assert_eq!((head, tail), (&b"abc\xE4"[..], &b"\xA8efg"[..]));
}

#[test]
fn test_latin1_normalize_nfd_to() {
    // Normalize every Latin1 code point via the dedicated Latin1 path.
    let text: Vec<u8> = (0..=255u8).collect();
    let mut normalized: Vec<u16> = Vec::new();
    assert!(icu_normalizer::latin1::normalize_nfd_to(&text, &mut normalized).is_ok());
    // Normalize the same code points as UTF-16 with the general normalizer
    // and require identical output.
    let nfd = DecomposingNormalizerBorrowed::new_nfd();
    let text16: Vec<u16> = (0..=255u16).collect();
    let mut expected: Vec<u16> = Vec::new();
    assert!(nfd.normalize_utf16_to(&text16, &mut expected).is_ok());
    assert_eq!(&normalized[..], &expected[..]);
    assert!(nfd.is_normalized_utf16(&normalized));
}

#[test]
fn test_latin1_normalize_nfkd_to() {
    // (Renamed from the typo'd `test_latin1_normalize_nkfd_to`.)
    // Normalize every Latin1 code point via the dedicated Latin1 path.
    let mut text: Vec<u8> = Vec::new();
    for c in 0..=255u8 {
        text.push(c);
    }
    let mut normalized: Vec<u16> = Vec::new();
    assert!(icu_normalizer::latin1::normalize_nfkd_to(&text, &mut normalized).is_ok());
    // Normalize the same code points as UTF-16 with the general normalizer
    // and require identical output.
    let nfkd = DecomposingNormalizerBorrowed::new_nfkd();
    let mut text16: Vec<u16> = Vec::new();
    for c in 0..=255u16 {
        text16.push(c);
    }
    let mut normalized16: Vec<u16> = Vec::new();
    assert!(nfkd.normalize_utf16_to(&text16, &mut normalized16).is_ok());
    assert_eq!(&normalized[..], &normalized16[..]);
    assert!(nfkd.is_normalized_utf16(&normalized));
}

#[test]
fn test_latin1_normalize_nfkc_to() {
    // (Renamed from the typo'd `test_latin1_normalize_nkfc_to`.)
    // Normalize every Latin1 code point via the dedicated Latin1 path.
    let mut text: Vec<u8> = Vec::new();
    for c in 0..=255u8 {
        text.push(c);
    }
    let mut normalized: Vec<u16> = Vec::new();
    assert!(icu_normalizer::latin1::normalize_nfkc_to(&text, &mut normalized).is_ok());
    // Normalize the same code points as UTF-16 with the general normalizer
    // and require identical output.
    let nfkc = ComposingNormalizerBorrowed::new_nfkc();
    let mut text16: Vec<u16> = Vec::new();
    for c in 0..=255u16 {
        text16.push(c);
    }
    let mut normalized16: Vec<u16> = Vec::new();
    assert!(nfkc.normalize_utf16_to(&text16, &mut normalized16).is_ok());
    assert_eq!(&normalized[..], &normalized16[..]);
    assert!(nfkc.is_normalized_utf16(&normalized));
}
Loading