Skip to content

Commit 09a6d2d

Browse files
committed
Add non-complex segmenter constructors (unicode-org#7268)
Users already do this with custom data (e.g. linebender/parley#436); we should provide an easier way. Fixes unicode-org#3612
1 parent 3237384 commit 09a6d2d

20 files changed

+767
-76
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ Several crates have had patch releases in the 2.1 stream:
1212
- Add `IndicConjunctBreak` (unicode-org#7280)
1313
- Add conversion between `icu::properties::props::Script` and `icu::locale::subtags::Script` (unicode-org#7270)
1414
- Add conversion between `icu::properties::props::BidiClass` and `unicode_bidi::BidiClass` (unicode-org#7272)
15+
- (2.1.2) `icu_segmenter`
16+
- Add non-complex line and word break constructors (unicode-org#7268)
17+
- Utils
18+
- Retire the `icu_harfbuzz` crate. The `icu_properties` and `icu_normalizer` types now directly implement the traits from `harfbuzz-traits`.
1519

1620
## icu4x 2.1
1721

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

components/segmenter/Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
[package]
66
name = "icu_segmenter"
77
description = "Unicode line breaking and text segmentation algorithms for text boundaries analysis"
8+
version = "2.1.2"
89

910
authors.workspace = true
1011
categories.workspace = true
@@ -14,7 +15,6 @@ include.workspace = true
1415
license.workspace = true
1516
repository.workspace = true
1617
rust-version.workspace = true
17-
version.workspace = true
1818

1919
[package.metadata.docs.rs]
2020
all-features = true
@@ -51,7 +51,7 @@ default = ["compiled_data", "auto"]
5151
serde = ["dep:serde", "potential_utf/serde", "zerovec/serde", "icu_collections/serde", "icu_provider/serde"]
5252
datagen = ["serde", "dep:databake", "potential_utf/databake", "zerovec/databake", "icu_collections/databake", "icu_provider/export"]
5353
lstm = ["dep:core_maths"]
54-
auto = ["lstm"] # Enabled try_new_auto_unstable constructors
54+
auto = ["lstm"] # Enables [try_]new_auto constructors
5555
compiled_data = ["dep:icu_segmenter_data", "dep:icu_locale", "icu_locale?/compiled_data", "icu_provider/baked"]
5656

5757
[lib]

components/segmenter/src/complex/mod.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,18 @@ impl ComplexPayloadsBorrowed<'static> {
256256
}
257257
}
258258

259+
#[cfg(feature = "compiled_data")]
260+
pub(crate) const fn empty() -> Self {
261+
Self {
262+
grapheme: GraphemeClusterSegmenter::new(),
263+
my: None,
264+
km: None,
265+
lo: None,
266+
th: None,
267+
ja: None,
268+
}
269+
}
270+
259271
pub(crate) fn static_to_owned(self) -> ComplexPayloads {
260272
ComplexPayloads {
261273
grapheme: self.grapheme.static_to_owned(),
@@ -379,6 +391,20 @@ impl ComplexPayloads {
379391
ja: None,
380392
})
381393
}
394+
395+
pub(crate) fn try_new_empty<D>(provider: &D) -> Result<Self, DataError>
396+
where
397+
D: DataProvider<SegmenterBreakGraphemeClusterV1> + ?Sized,
398+
{
399+
Ok(Self {
400+
grapheme: GraphemeClusterSegmenter::try_new_unstable(provider)?,
401+
my: None,
402+
km: None,
403+
lo: None,
404+
th: None,
405+
ja: None,
406+
})
407+
}
382408
}
383409
fn try_load<M: DataMarker, P: DataProvider<M> + ?Sized>(
384410
provider: &P,

components/segmenter/src/line.rs

Lines changed: 78 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use alloc::string::String;
1010
use alloc::vec;
1111
use alloc::vec::Vec;
1212
use core::char;
13-
use icu_locale_core::subtags::language;
13+
use icu_locale_core::subtags::{language, Language};
1414
use icu_locale_core::LanguageIdentifier;
1515
use icu_provider::prelude::*;
1616
use utf8_iter::Utf8CharIndices;
@@ -208,24 +208,42 @@ pub struct LineBreakOptions<'a> {
208208
pub content_locale: Option<&'a LanguageIdentifier>,
209209
}
210210

211+
impl LineBreakOptions<'_> {
212+
/// `const` version of [`Default::default`]
213+
pub const fn default() -> Self {
214+
Self {
215+
strictness: None,
216+
word_option: None,
217+
content_locale: None,
218+
}
219+
}
220+
}
221+
211222
#[derive(Debug, Clone, Copy)]
212223
struct ResolvedLineBreakOptions {
213224
strictness: LineBreakStrictness,
214225
word_option: LineBreakWordOption,
215226
ja_zh: bool,
216227
}
217228

218-
impl From<LineBreakOptions<'_>> for ResolvedLineBreakOptions {
219-
fn from(options: LineBreakOptions<'_>) -> Self {
220-
let ja_zh = if let Some(content_locale) = options.content_locale.as_ref() {
221-
content_locale.language == language!("ja") || content_locale.language == language!("zh")
222-
} else {
223-
false
224-
};
225-
Self {
226-
strictness: options.strictness.unwrap_or_default(),
227-
word_option: options.word_option.unwrap_or_default(),
228-
ja_zh,
229+
impl LineBreakOptions<'_> {
230+
const fn resolve(self) -> ResolvedLineBreakOptions {
231+
ResolvedLineBreakOptions {
232+
strictness: match self.strictness {
233+
Some(s) => s,
234+
None => LineBreakStrictness::Strict,
235+
},
236+
word_option: match self.word_option {
237+
Some(s) => s,
238+
None => LineBreakWordOption::Normal,
239+
},
240+
ja_zh: if let Some(content_locale) = self.content_locale.as_ref() {
241+
const JA: Language = language!("ja");
242+
const ZH: Language = language!("zh");
243+
matches!(content_locale.language, JA | ZH)
244+
} else {
245+
false
246+
},
229247
}
230248
}
231249
}
@@ -426,7 +444,7 @@ impl LineSegmenter {
426444
#[cfg(feature = "compiled_data")]
427445
pub fn new_lstm(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
428446
LineSegmenterBorrowed {
429-
options: options.into(),
447+
options: options.resolve(),
430448
data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
431449
complex: ComplexPayloadsBorrowed::new_lstm(),
432450
}
@@ -456,7 +474,7 @@ impl LineSegmenter {
456474
+ ?Sized,
457475
{
458476
Ok(Self {
459-
options: options.into(),
477+
options: options.resolve(),
460478
payload: provider.load(Default::default())?.payload,
461479
complex: ComplexPayloads::try_new_lstm(provider)?,
462480
})
@@ -468,15 +486,13 @@ impl LineSegmenter {
468486
/// The dictionary model uses a list of words to determine appropriate breakpoints. It is
469487
/// faster than the LSTM model but requires more data.
470488
///
471-
/// See also [`Self::new_dictionary`].
472-
///
473489
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
474490
///
475491
/// [📚 Help choosing a constructor](icu_provider::constructors)
476492
#[cfg(feature = "compiled_data")]
477493
pub fn new_dictionary(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
478494
LineSegmenterBorrowed {
479-
options: options.into(),
495+
options: options.resolve(),
480496
data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
481497
// Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handles CJK
482498
// characters [1]. Southeast Asian languages however require complex context analysis
@@ -510,7 +526,7 @@ impl LineSegmenter {
510526
+ ?Sized,
511527
{
512528
Ok(Self {
513-
options: options.into(),
529+
options: options.resolve(),
514530
payload: provider.load(Default::default())?.payload,
515531
// Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handles CJK
516532
// characters [1]. Southeast Asian languages however require complex context analysis
@@ -522,6 +538,50 @@ impl LineSegmenter {
522538
})
523539
}
524540

541+
/// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
542+
/// no support for scripts requiring complex context-dependent line breaks (Khmer, Lao, Myanmar, Thai).
543+
///
544+
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
545+
///
546+
/// [📚 Help choosing a constructor](icu_provider::constructors)
547+
#[cfg(feature = "compiled_data")]
548+
pub const fn new_for_non_complex_scripts(
549+
options: LineBreakOptions,
550+
) -> LineSegmenterBorrowed<'static> {
551+
LineSegmenterBorrowed {
552+
options: options.resolve(),
553+
data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
554+
complex: ComplexPayloadsBorrowed::empty(),
555+
}
556+
}
557+
558+
icu_provider::gen_buffer_data_constructors!(
559+
(options: LineBreakOptions) -> error: DataError,
560+
functions: [
561+
new_for_non_complex_scripts: skip,
562+
try_new_for_non_complex_scripts_with_buffer_provider,
563+
try_new_for_non_complex_scripts_unstable,
564+
Self,
565+
]
566+
);
567+
568+
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_for_non_complex_scripts)]
569+
pub fn try_new_for_non_complex_scripts_unstable<D>(
570+
provider: &D,
571+
options: LineBreakOptions,
572+
) -> Result<Self, DataError>
573+
where
574+
D: DataProvider<SegmenterBreakLineV1>
575+
+ DataProvider<SegmenterBreakGraphemeClusterV1>
576+
+ ?Sized,
577+
{
578+
Ok(Self {
579+
options: options.resolve(),
580+
payload: provider.load(Default::default())?.payload,
581+
complex: ComplexPayloads::try_new_empty(provider)?,
582+
})
583+
}
584+
525585
/// Constructs a borrowed version of this type for more efficient querying.
526586
///
527587
/// Most useful methods for segmentation are on this type.

components/segmenter/src/word.rs

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,13 +57,30 @@ pub struct WordBreakOptions<'a> {
5757
pub invariant_options: WordBreakInvariantOptions,
5858
}
5959

60+
impl WordBreakOptions<'_> {
61+
/// `const` version of [`Default::default`]
62+
pub const fn default() -> Self {
63+
Self {
64+
content_locale: None,
65+
invariant_options: WordBreakInvariantOptions::default(),
66+
}
67+
}
68+
}
69+
6070
/// Locale-independent options to tailor word breaking behavior
6171
///
6272
/// Currently empty but may grow in the future
6373
#[non_exhaustive]
6474
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
6575
pub struct WordBreakInvariantOptions {}
6676

77+
impl WordBreakInvariantOptions {
78+
/// `const` version of [`Default::default`]
79+
pub const fn default() -> Self {
80+
Self {}
81+
}
82+
}
83+
6784
/// Implements the [`Iterator`] trait over the word boundaries of the given string.
6885
///
6986
/// Lifetimes:
@@ -513,6 +530,68 @@ impl WordSegmenter {
513530
},
514531
})
515532
}
533+
534+
/// Constructs a [`WordSegmenter`] with an invariant locale and no support for
535+
/// scripts requiring complex context-dependent word breaks (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai).
536+
///
537+
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
538+
///
539+
/// [📚 Help choosing a constructor](icu_provider::constructors)
540+
#[cfg(feature = "compiled_data")]
541+
pub const fn new_for_non_complex_scripts(
542+
_options: WordBreakInvariantOptions,
543+
) -> WordSegmenterBorrowed<'static> {
544+
WordSegmenterBorrowed {
545+
data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
546+
complex: ComplexPayloadsBorrowed::empty(),
547+
locale_override: None,
548+
}
549+
}
550+
551+
icu_provider::gen_buffer_data_constructors!(
552+
(options: WordBreakOptions) -> error: DataError,
553+
functions: [
554+
try_new_for_non_complex_scripts,
555+
try_new_for_non_complex_scripts_with_buffer_provider,
556+
try_new_for_non_complex_scripts_unstable,
557+
Self
558+
]
559+
);
560+
561+
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_for_non_complex_scripts)]
562+
pub fn try_new_for_non_complex_scripts_unstable<D>(
563+
provider: &D,
564+
options: WordBreakOptions,
565+
) -> Result<Self, DataError>
566+
where
567+
D: DataProvider<SegmenterBreakWordV1>
568+
+ DataProvider<SegmenterBreakWordOverrideV1>
569+
+ DataProvider<SegmenterBreakGraphemeClusterV1>
570+
+ ?Sized,
571+
{
572+
Ok(Self {
573+
payload: provider.load(Default::default())?.payload,
574+
complex: ComplexPayloads::try_new_empty(provider)?,
575+
payload_locale_override: if let Some(locale) = options.content_locale {
576+
let locale = DataLocale::from(locale);
577+
let req = DataRequest {
578+
id: DataIdentifierBorrowed::for_locale(&locale),
579+
metadata: {
580+
let mut metadata = DataRequestMetadata::default();
581+
metadata.silent = true;
582+
metadata
583+
},
584+
};
585+
provider
586+
.load(req)
587+
.allow_identifier_not_found()?
588+
.map(|r| r.payload)
589+
} else {
590+
None
591+
},
592+
})
593+
}
594+
516595
/// Constructs a borrowed version of this type for more efficient querying.
517596
///
518597
/// Most useful methods for segmentation are on this type.

ffi/capi/bindings/c/LineSegmenter.h

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ffi/capi/bindings/c/WordSegmenter.h

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)