Skip to content

Commit c6670f4

Browse files
authored
Data struct with codepoint builder for radicals (#7805)
Issue: #6941

Modified from #7722 and #7646

## Changelog

- `icu_segmenter`: Add unstable Unihan radical provider data and baked support
  * New types: `icu_segmenter::provider::radical::UnihanRadicalsData<'data>`, `icu_segmenter::provider::radical::SegmenterUnihanRadicalV1`
  * New associated const: `icu_segmenter::provider::Baked::SINGLETON_SEGMENTER_UNIHAN_RADICAL_V1`
  * The `experimental_segmenter` example now uses `radaboost` for the Chinese radical model and adds `thadaboost` for Thai
- `icu_provider_source`: Add Unihan radical trie generation for `icu_segmenter::provider::radical::SegmenterUnihanRadicalV1`
  * `SourceDataProvider` can now load this marker from Unihan IRG data
- `icu_provider_registry`: Export `icu_segmenter::provider::radical::SegmenterUnihanRadicalV1`
1 parent f4b6f65 commit c6670f4

File tree

21 files changed

+10976
-165
lines changed

21 files changed

+10976
-165
lines changed

components/icu/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ unstable = [
128128
"icu_calendar/unstable",
129129
"icu_datetime/unstable",
130130
"icu_plurals/unstable",
131+
"icu_segmenter/unstable",
131132
"icu_time/unstable",
132133
"dep:icu_experimental",
133134
"dep:icu_pattern",

components/segmenter/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ default = ["compiled_data", "auto"]
5656
serde = ["dep:serde", "potential_utf/serde", "zerovec/serde", "icu_collections/serde", "icu_provider/serde"]
5757
datagen = ["serde", "dep:databake", "potential_utf/databake", "zerovec/databake", "icu_collections/databake", "icu_provider/export"]
5858
lstm = ["dep:core_maths"]
59+
unstable = []
5960
auto = ["lstm"] # Enables [try_]new_auto constructors
6061
compiled_data = ["dep:icu_segmenter_data", "dep:icu_locale", "icu_locale?/compiled_data", "icu_provider/baked"]
6162

components/segmenter/examples/experimental_segmenter.rs

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,24 +14,48 @@ mod cnn;
1414
use adaboost::Predictor;
1515
use cnn::{CnnSegmenter, RawCnnData};
1616
use icu_segmenter::{options::WordBreakOptions, WordSegmenter, WordSegmenterBorrowed};
17-
use std::time::SystemTime;
17+
use std::time::Instant;
1818

1919
const REPETITIONS: usize = 1000;
2020

21-
fn main_adaboost(args: &[String]) {
21+
fn main_radaboost(args: &[String]) {
2222
let segmenter = Predictor::for_test();
2323
let s = &args[0];
24-
let start_time = SystemTime::now();
24+
let start_time = Instant::now();
2525
for _ in 0..REPETITIONS {
2626
segmenter.predict(s);
2727
}
28-
let elapsed = start_time.elapsed().unwrap();
28+
let elapsed = start_time.elapsed();
2929
println!("Output:");
3030
let mut prev = 0;
3131
for breakpoint in segmenter.predict_breakpoints(s) {
3232
print!("{}|", &s[prev..breakpoint]);
3333
prev = breakpoint;
3434
}
35+
if prev < s.len() {
36+
print!("{}", &s[prev..]);
37+
}
38+
println!();
39+
println!("{} repetitions done in: {:?}", REPETITIONS, elapsed);
40+
}
41+
42+
fn main_thadaboost(args: &[String]) {
43+
let segmenter = Predictor::for_test_thai();
44+
let s = &args[0];
45+
let start_time = Instant::now();
46+
for _ in 0..REPETITIONS {
47+
segmenter.predict_thai(s);
48+
}
49+
let elapsed = start_time.elapsed();
50+
println!("Output:");
51+
let mut prev = 0;
52+
for breakpoint in segmenter.predict_thai_breakpoints(s) {
53+
print!("{}|", &s[prev..breakpoint]);
54+
prev = breakpoint;
55+
}
56+
if prev < s.len() {
57+
print!("{}", &s[prev..]);
58+
}
3559
println!();
3660
println!("{} repetitions done in: {:?}", REPETITIONS, elapsed);
3761
}
@@ -55,11 +79,11 @@ fn main_cnn(args: &[String]) {
5579
.unwrap();
5680
let segmenter = CnnSegmenter::new(&cnndata);
5781
let s = &args[0];
58-
let start_time = SystemTime::now();
82+
let start_time = Instant::now();
5983
for _ in 0..REPETITIONS {
6084
segmenter.segment_str(s);
6185
}
62-
let elapsed = start_time.elapsed().unwrap();
86+
let elapsed = start_time.elapsed();
6387
println!("Output:");
6488
let mut prev = 0;
6589
for breakpoint in segmenter.segment_str(s).to_breakpoints() {
@@ -82,11 +106,11 @@ fn main_lstm(mut args: &[String]) {
82106
}
83107

84108
fn run_word_segmenter(segmenter: WordSegmenterBorrowed, s: &str) {
85-
let start_time = SystemTime::now();
109+
let start_time = Instant::now();
86110
for _ in 0..REPETITIONS {
87111
segmenter.segment_str(s).count(); // consume the iterator
88112
}
89-
let elapsed = start_time.elapsed().unwrap();
113+
let elapsed = start_time.elapsed();
90114
println!("Output:");
91115
let mut prev = 0;
92116
for breakpoint in segmenter.segment_str(s) {
@@ -104,7 +128,8 @@ fn main() {
104128
return;
105129
}
106130
match args[1].as_str() {
107-
"adaboost" => main_adaboost(&args[2..]),
131+
"radaboost" => main_radaboost(&args[2..]),
132+
"thadaboost" => main_thadaboost(&args[2..]),
108133
"dict" | "dictionary" => main_dict(&args[2..]),
109134
"cnn" => main_cnn(&args[2..]),
110135
"lstm" => main_lstm(&args[2..]),

components/segmenter/src/provider/mod.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
mod lstm;
1919
pub use lstm::*;
20+
#[cfg(feature = "unstable")]
21+
pub mod radical;
2022

2123
use crate::options::WordType;
2224
use icu_collections::codepointtrie::CodePointTrie;
@@ -51,6 +53,8 @@ const _: () = {
5153
impl_segmenter_break_line_v1!(Baked);
5254
#[cfg(feature = "lstm")]
5355
impl_segmenter_lstm_auto_v1!(Baked);
56+
#[cfg(feature = "unstable")]
57+
impl_segmenter_unihan_radical_v1!(Baked);
5458
impl_segmenter_break_word_v1!(Baked);
5559
impl_segmenter_break_word_override_v1!(Baked);
5660
impl_segmenter_break_sentence_override_v1!(Baked);
@@ -135,6 +139,8 @@ pub const MARKERS: &[DataMarkerInfo] = &[
135139
SegmenterDictionaryAutoV1::INFO,
136140
SegmenterDictionaryExtendedV1::INFO,
137141
SegmenterLstmAutoV1::INFO,
142+
#[cfg(feature = "unstable")]
143+
radical::SegmenterUnihanRadicalV1::INFO,
138144
];
139145

140146
/// Pre-processed Unicode data in the form of tables to be used for rule-based breaking.
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
// This file is part of ICU4X. For terms of use, please see the file
2+
// called LICENSE at the top level of the ICU4X source tree
3+
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4+
5+
//! Data provider struct definitions for radicals.
6+
7+
use icu_collections::codepointtrie::CodePointTrie;
8+
use icu_provider::prelude::*;
9+
10+
/// Data for Unihan radicals.
11+
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
12+
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
13+
#[cfg_attr(feature = "datagen", databake(path = icu_segmenter::provider::radical))]
14+
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
15+
pub struct UnihanRadicalsData<'data> {
16+
/// Trie mapping code points to their IRG source radical ID (u8).
17+
#[cfg_attr(feature = "serde", serde(borrow))]
18+
pub trie: CodePointTrie<'data, u8>,
19+
}
20+
21+
icu_provider::data_struct!(
22+
UnihanRadicalsData<'_>,
23+
#[cfg(feature = "datagen")]
24+
);
25+
26+
icu_provider::data_marker!(
27+
/// Marker for the singleton trie mapping code points to their Unihan IRG source radical IDs.
28+
SegmenterUnihanRadicalV1,
29+
"segmenter/unihan/radical/v1",
30+
UnihanRadicalsData<'static>,
31+
is_singleton = true
32+
);

components/segmenter/tests/adaboost/main.rs

Lines changed: 27 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -4,59 +4,37 @@
44

55
#![allow(dead_code)]
66

7+
use icu_segmenter::provider::{radical::UnihanRadicalsData, Baked};
78
use std::collections::HashMap;
89

910
static MODEL_FOR_TEST: &str = include_str!("model.json");
1011
static MODEL_FOR_TEST_THAI: &str = include_str!("model_thai.json");
1112

12-
static CODEPOINTS: &[u16] = &[
13-
20008, 20022, 20031, 20057, 20101, 20108, 20128, 20154, 20799, 20837, 20843, 20866, 20886,
14-
20907, 20960, 20981, 20992, 21147, 21241, 21269, 21274, 21304, 21313, 21340, 21353, 21378,
15-
21430, 21448, 21475, 22231, 22303, 22763, 22786, 22794, 22805, 22823, 22899, 23376, 23424,
16-
23544, 23567, 23586, 23608, 23662, 23665, 24027, 24037, 24049, 24062, 24178, 24186, 24191,
17-
24308, 24318, 24331, 24339, 24400, 24417, 24435, 24515, 25096, 25142, 25163, 25903, 25908,
18-
25991, 26007, 26020, 26041, 26080, 26085, 26352, 26376, 26408, 27424, 27490, 27513, 27571,
19-
27595, 27604, 27611, 27663, 27668, 27700, 28779, 29226, 29238, 29243, 29247, 29255, 29273,
20-
29275, 29356, 29572, 29577, 29916, 29926, 29976, 29983, 29992, 30000, 30091, 30098, 30326,
21-
30333, 30382, 30399, 30446, 30683, 30690, 30707, 31034, 31160, 31166, 31348, 31435, 31481,
22-
31859, 31992, 32566, 32593, 32650, 32701, 32769, 32780, 32786, 32819, 32895, 32905, 33251,
23-
33258, 33267, 33276, 33292, 33307, 33311, 33390, 33394, 33400, 34381, 34411, 34880, 34892,
24-
34915, 35198, 35211, 35282, 35328, 35895, 35910, 35925, 35960, 35997, 36196, 36208, 36275,
25-
36523, 36554, 36763, 36784, 36789, 37009, 37193, 37318, 37324, 37329, 38263, 38272, 38428,
26-
38582, 38585, 38632, 38737, 38750, 38754, 38761, 38859, 38893, 38899, 38913, 39080, 39131,
27-
39135, 39318, 39321, 39340, 39592, 39640, 39647, 39717, 39727, 39730, 39740, 39770, 40165,
28-
40565, 40575, 40613, 40635, 40643, 40653, 40657, 40697, 40701, 40718, 40723, 40736, 40763,
29-
40778, 40786, 40845, 40860, 40864,
30-
];
31-
32-
pub(crate) fn get_radical(ch: char) -> u8 {
33-
let id = ch as u32;
34-
35-
if !(19968..=40869).contains(&id) {
36-
return 0;
37-
}
38-
39-
let idx = CODEPOINTS.partition_point(|&b| (b as u32) <= id);
40-
(idx as u8) + 1
13+
pub(crate) fn get_radical(radicals: &UnihanRadicalsData<'_>, ch: char) -> u8 {
14+
radicals.trie.get(ch)
4115
}
4216

43-
pub(crate) struct Predictor {
17+
pub(crate) struct Predictor<'a> {
4418
pub(crate) model: HashMap<String, HashMap<String, i16>>,
19+
radicals: &'a UnihanRadicalsData<'a>,
4520
}
4621

47-
impl Predictor {
48-
pub(crate) fn from_json(json: &str) -> Self {
22+
impl<'a> Predictor<'a> {
23+
pub(crate) fn from_json(json: &str, radicals: &'a UnihanRadicalsData<'a>) -> Self {
4924
let model: HashMap<String, HashMap<String, i16>> =
5025
serde_json::from_str(json).unwrap_or_default();
51-
Self { model }
26+
Self { model, radicals }
5227
}
5328

5429
pub(crate) fn for_test() -> Self {
55-
Self::from_json(MODEL_FOR_TEST)
30+
Self::from_json(MODEL_FOR_TEST, Baked::SINGLETON_SEGMENTER_UNIHAN_RADICAL_V1)
5631
}
5732

5833
pub(crate) fn for_test_thai() -> Self {
59-
Self::from_json(MODEL_FOR_TEST_THAI)
34+
Self::from_json(
35+
MODEL_FOR_TEST_THAI,
36+
Baked::SINGLETON_SEGMENTER_UNIHAN_RADICAL_V1,
37+
)
6038
}
6139

6240
pub(crate) fn predict(&self, sentence: &str) -> Vec<i16> {
@@ -73,15 +51,15 @@ impl Predictor {
7351

7452
let mut score: i16 = 4;
7553

76-
let rad4 = get_radical(c);
54+
let rad4 = get_radical(self.radicals, c);
7755
if rad4 != 0 {
7856
if let Some(map) = self.model.get("RSRID") {
7957
let key = format!("{}:{}", c_prev, rad4);
8058
score += map.get(&key).copied().unwrap_or(0);
8159
}
8260
}
8361

84-
let rad3 = get_radical(c_prev);
62+
let rad3 = get_radical(self.radicals, c_prev);
8563
if rad3 != 0 {
8664
if let Some(map) = self.model.get("LSRID") {
8765
let key = format!("{}:{}", rad3, c);
@@ -247,6 +225,18 @@ impl Predictor {
247225
}
248226
breakpoints
249227
}
228+
229+
pub(crate) fn predict_thai_breakpoints(&self, sentence: &str) -> Vec<usize> {
230+
let mut breakpoints = vec![0];
231+
let mut offset = 0;
232+
for (&score, ch) in self.predict_thai(sentence).iter().zip(sentence.chars()) {
233+
offset += ch.len_utf8();
234+
if score > 0 {
235+
breakpoints.push(offset);
236+
}
237+
}
238+
breakpoints
239+
}
250240
}
251241

252242
#[cfg(test)]

components/segmenter/tests/adaboost/python_test_output.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
2404
2525
-1604
2626
2824
27-
440
27+
320
2828
-525
2929
5881
3030
4892
@@ -46,4 +46,4 @@
4646
4605
4747
4324
4848
-2139
49-
5215
49+
5215

provider/data/segmenter/data/mod.rs

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

provider/data/segmenter/data/segmenter_unihan_radical_v1.rs.data

Lines changed: 85 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

provider/data/segmenter/fingerprints.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,4 @@ segmenter/lstm/auto/v1, und/Burmese_codepoints_exclusive_model4_heavy, 91365B, 9
2424
segmenter/lstm/auto/v1, und/Khmer_codepoints_exclusive_model4_heavy, 74665B, 74368B, cc6fe9f66fed196d
2525
segmenter/lstm/auto/v1, und/Lao_codepoints_exclusive_model4_heavy, 72160B, 71863B, 3f52a4025c7d618f
2626
segmenter/lstm/auto/v1, und/Thai_codepoints_exclusive_model4_heavy, 72327B, 72030B, 4486b38238d7c651
27+
segmenter/unihan/radical/v1, <singleton>, 13080B, 13020B, d875e702fbedc249

0 commit comments

Comments
 (0)