Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
- Add conversion between `icu::properties::props::Script` and `icu::locale::subtags::Script` (unicode-org#7270)
- Add conversion between `icu::properties::props::BidiClass` and `unicode_bidi::BidiClass` (unicode-org#7272)
- Constify `PropertyNamesLong`/`PropertyNamesShort`/`PropertyParser` constructors (unicode-org#7294)
- `icu_segmenter`
- Add non-complex line and word break constructors (unicode-org#7268)
- Utils
- Retire the `icu_harfbuzz` crate. The `icu_properties` and `icu_normalizer` types now directly implement the `harfbuzz-traits`

Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions components/segmenter/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
[package]
name = "icu_segmenter"
description = "Unicode line breaking and text segmentation algorithms for text boundaries analysis"
version = "2.1.2"

authors.workspace = true
categories.workspace = true
Expand All @@ -14,7 +15,6 @@ include.workspace = true
license.workspace = true
repository.workspace = true
rust-version.workspace = true
version.workspace = true

[package.metadata.docs.rs]
all-features = true
Expand Down Expand Up @@ -52,7 +52,7 @@ default = ["compiled_data", "auto"]
serde = ["dep:serde", "potential_utf/serde", "zerovec/serde", "icu_collections/serde", "icu_provider/serde"]
datagen = ["serde", "dep:databake", "potential_utf/databake", "zerovec/databake", "icu_collections/databake", "icu_provider/export"]
lstm = ["dep:core_maths"]
auto = ["lstm"] # Enabled try_new_auto_unstable constructors
auto = ["lstm"] # Enables [try_]new_auto constructors
compiled_data = ["dep:icu_segmenter_data", "dep:icu_locale", "icu_locale?/compiled_data", "icu_provider/baked"]

[lib]
Expand Down
26 changes: 26 additions & 0 deletions components/segmenter/src/complex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,18 @@ impl ComplexPayloadsBorrowed<'static> {
}
}

/// Builds a payload set that holds only the grapheme cluster segmenter.
///
/// Every per-language slot (`my`, `km`, `lo`, `th`, `ja`) is left as `None`, so no
/// language-specific model is attached to the returned value.
///
/// Only available with the `compiled_data` Cargo feature.
#[cfg(feature = "compiled_data")]
pub(crate) const fn empty() -> Self {
Self {
// The grapheme segmenter is the only member that is always populated.
grapheme: GraphemeClusterSegmenter::new(),
my: None,
km: None,
lo: None,
th: None,
ja: None,
}
}

pub(crate) fn static_to_owned(self) -> ComplexPayloads {
ComplexPayloads {
grapheme: self.grapheme.static_to_owned(),
Expand Down Expand Up @@ -379,6 +391,20 @@ impl ComplexPayloads {
ja: None,
})
}

/// Builds a payload set that holds only the grapheme cluster segmenter, loading it
/// from `provider`.
///
/// Every per-language slot (`my`, `km`, `lo`, `th`, `ja`) is left as `None`.
///
/// # Errors
///
/// Returns the [`DataError`] produced by
/// `GraphemeClusterSegmenter::try_new_unstable` if the grapheme data cannot be loaded.
pub(crate) fn try_new_empty<D>(provider: &D) -> Result<Self, DataError>
where
D: DataProvider<SegmenterBreakGraphemeClusterV1> + ?Sized,
{
Ok(Self {
// The only fallible step: everything else is a plain `None`.
grapheme: GraphemeClusterSegmenter::try_new_unstable(provider)?,
my: None,
km: None,
lo: None,
th: None,
ja: None,
})
}
}
fn try_load<M: DataMarker, P: DataProvider<M> + ?Sized>(
provider: &P,
Expand Down
96 changes: 78 additions & 18 deletions components/segmenter/src/line.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use alloc::string::String;
use alloc::vec;
use alloc::vec::Vec;
use core::char;
use icu_locale_core::subtags::language;
use icu_locale_core::subtags::{language, Language};
use icu_locale_core::LanguageIdentifier;
use icu_provider::prelude::*;
use utf8_iter::Utf8CharIndices;
Expand Down Expand Up @@ -208,24 +208,42 @@ pub struct LineBreakOptions<'a> {
pub content_locale: Option<&'a LanguageIdentifier>,
}

impl LineBreakOptions<'_> {
/// `const` version of [`Default::default`]
///
/// Returns options with `strictness`, `word_option` and `content_locale` all unset
/// (`None`).
pub const fn default() -> Self {
Self {
strictness: None,
word_option: None,
content_locale: None,
}
}
}

/// [`LineBreakOptions`] with every optional field replaced by a concrete value.
#[derive(Debug, Clone, Copy)]
struct ResolvedLineBreakOptions {
/// Strictness level to apply; never optional at this stage.
strictness: LineBreakStrictness,
/// Word option to apply; never optional at this stage.
word_option: LineBreakWordOption,
/// `true` when the content locale's language is `ja` or `zh`.
ja_zh: bool,
}

impl From<LineBreakOptions<'_>> for ResolvedLineBreakOptions {
fn from(options: LineBreakOptions<'_>) -> Self {
let ja_zh = if let Some(content_locale) = options.content_locale.as_ref() {
content_locale.language == language!("ja") || content_locale.language == language!("zh")
} else {
false
};
Self {
strictness: options.strictness.unwrap_or_default(),
word_option: options.word_option.unwrap_or_default(),
ja_zh,
impl LineBreakOptions<'_> {
    /// Turns the optional, user-facing options into a [`ResolvedLineBreakOptions`].
    ///
    /// Unset fields fall back to `LineBreakStrictness::Strict` and
    /// `LineBreakWordOption::Normal`; `ja_zh` is set only when a content locale is
    /// present and its language is `ja` or `zh`.
    ///
    /// The fallbacks are spelled out with `match` so this stays a `const fn`.
    // NOTE(review): the fallbacks are presumably the same values the `Default` impls
    // of the two enums yield — confirm they stay in sync.
    const fn resolve(self) -> ResolvedLineBreakOptions {
        const JA: Language = language!("ja");
        const ZH: Language = language!("zh");

        let strictness = match self.strictness {
            Some(chosen) => chosen,
            None => LineBreakStrictness::Strict,
        };
        let word_option = match self.word_option {
            Some(chosen) => chosen,
            None => LineBreakWordOption::Normal,
        };
        let ja_zh = match self.content_locale {
            Some(locale) => matches!(locale.language, JA | ZH),
            None => false,
        };

        ResolvedLineBreakOptions {
            strictness,
            word_option,
            ja_zh,
        }
    }
}
Expand Down Expand Up @@ -426,7 +444,7 @@ impl LineSegmenter {
#[cfg(feature = "compiled_data")]
pub fn new_lstm(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
LineSegmenterBorrowed {
options: options.into(),
options: options.resolve(),
data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
complex: ComplexPayloadsBorrowed::new_lstm(),
}
Expand Down Expand Up @@ -456,7 +474,7 @@ impl LineSegmenter {
+ ?Sized,
{
Ok(Self {
options: options.into(),
options: options.resolve(),
payload: provider.load(Default::default())?.payload,
complex: ComplexPayloads::try_new_lstm(provider)?,
})
Expand All @@ -468,15 +486,13 @@ impl LineSegmenter {
/// The dictionary model uses a list of words to determine appropriate breakpoints. It is
/// faster than the LSTM model but requires more data.
///
/// See also [`Self::new_dictionary`].
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
pub fn new_dictionary(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
LineSegmenterBorrowed {
options: options.into(),
options: options.resolve(),
data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
// Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handles CJK
// characters [1]. Southeast Asian languages however require complex context analysis
Expand Down Expand Up @@ -510,7 +526,7 @@ impl LineSegmenter {
+ ?Sized,
{
Ok(Self {
options: options.into(),
options: options.resolve(),
payload: provider.load(Default::default())?.payload,
// Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handles CJK
// characters [1]. Southeast Asian languages however require complex context analysis
Expand All @@ -522,6 +538,50 @@ impl LineSegmenter {
})
}

/// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
/// no support for scripts requiring complex, context-dependent line breaks (Khmer, Lao, Myanmar, Thai).
///
/// This constructor is `const`, so the returned [`LineSegmenterBorrowed`] can be built in
/// a constant context.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
pub const fn new_for_non_complex_scripts(
options: LineBreakOptions,
) -> LineSegmenterBorrowed<'static> {
LineSegmenterBorrowed {
options: options.resolve(),
data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
// No per-language models are attached; only the grapheme segmenter is present.
complex: ComplexPayloadsBorrowed::empty(),
}
}

icu_provider::gen_buffer_data_constructors!(
(options: LineBreakOptions) -> error: DataError,
functions: [
new_for_non_complex_scripts: skip,
try_new_for_non_complex_scripts_with_buffer_provider,
try_new_for_non_complex_scripts_unstable,
Self,
]
);

// Provider-based counterpart of `new_for_non_complex_scripts`: the line break data and
// the grapheme cluster data are loaded from `provider`, and no per-language models are
// attached. Either load failing is returned as a `DataError`.
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_for_non_complex_scripts)]
pub fn try_new_for_non_complex_scripts_unstable<D>(
provider: &D,
options: LineBreakOptions,
) -> Result<Self, DataError>
where
D: DataProvider<SegmenterBreakLineV1>
+ DataProvider<SegmenterBreakGraphemeClusterV1>
+ ?Sized,
{
Ok(Self {
options: options.resolve(),
// Loaded with a default request (no locale set on it).
payload: provider.load(Default::default())?.payload,
// Only the grapheme segmenter is loaded here.
complex: ComplexPayloads::try_new_empty(provider)?,
})
}

/// Constructs a borrowed version of this type for more efficient querying.
///
/// Most useful methods for segmentation are on this type.
Expand Down
79 changes: 79 additions & 0 deletions components/segmenter/src/word.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,30 @@ pub struct WordBreakOptions<'a> {
pub invariant_options: WordBreakInvariantOptions,
}

impl WordBreakOptions<'_> {
/// `const` version of [`Default::default`]
///
/// Returns options with no `content_locale` and default `invariant_options`.
pub const fn default() -> Self {
Self {
content_locale: None,
invariant_options: WordBreakInvariantOptions::default(),
}
}
}

/// Locale-independent options to tailor word breaking behavior.
///
/// Currently empty but may grow in the future; the type is `#[non_exhaustive]`, so
/// code outside this crate cannot build it with a struct literal.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub struct WordBreakInvariantOptions {}

impl WordBreakInvariantOptions {
/// `const` version of [`Default::default`]
///
/// The struct has no fields, so this simply returns the empty value.
pub const fn default() -> Self {
Self {}
}
}

/// Implements the [`Iterator`] trait over the word boundaries of the given string.
///
/// Lifetimes:
Expand Down Expand Up @@ -513,6 +530,68 @@ impl WordSegmenter {
},
})
}

/// Construct a [`WordSegmenter`] with an invariant locale and no support for
/// scripts requiring complex, context-dependent word breaks (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai).
///
/// This constructor is `const`, so the returned [`WordSegmenterBorrowed`] can be built in
/// a constant context.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
pub const fn new_for_non_complex_scripts(
// Not read by this function: the parameter is accepted but currently has no effect.
_options: WordBreakInvariantOptions,
) -> WordSegmenterBorrowed<'static> {
WordSegmenterBorrowed {
data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
// No per-language models are attached; only the grapheme segmenter is present.
complex: ComplexPayloadsBorrowed::empty(),
// Invariant locale: no locale-specific override is applied.
locale_override: None,
}
}

icu_provider::gen_buffer_data_constructors!(
(options: WordBreakOptions) -> error: DataError,
functions: [
try_new_for_non_complex_scripts,
try_new_for_non_complex_scripts_with_buffer_provider,
try_new_for_non_complex_scripts_unstable,
Self
]
);

// Provider-based constructor without per-language models: the word break data and the
// grapheme cluster data are loaded from `provider`. When `options.content_locale` is
// set, a locale-specific override payload is additionally requested.
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_for_non_complex_scripts)]
pub fn try_new_for_non_complex_scripts_unstable<D>(
provider: &D,
options: WordBreakOptions,
) -> Result<Self, DataError>
where
D: DataProvider<SegmenterBreakWordV1>
+ DataProvider<SegmenterBreakWordOverrideV1>
+ DataProvider<SegmenterBreakGraphemeClusterV1>
+ ?Sized,
{
Ok(Self {
// Loaded with a default request (no locale set on it).
payload: provider.load(Default::default())?.payload,
// Only the grapheme segmenter is loaded here.
complex: ComplexPayloads::try_new_empty(provider)?,
payload_locale_override: if let Some(locale) = options.content_locale {
let locale = DataLocale::from(locale);
let req = DataRequest {
id: DataIdentifierBorrowed::for_locale(&locale),
metadata: {
let mut metadata = DataRequestMetadata::default();
// NOTE(review): presumably keeps the provider from logging an error when
// this optional lookup misses — confirm against `DataRequestMetadata`.
metadata.silent = true;
metadata
},
};
// NOTE(review): `allow_identifier_not_found` appears to turn a missing override
// into `None` instead of an error, while other errors still propagate via `?`
// — confirm.
provider
.load(req)
.allow_identifier_not_found()?
.map(|r| r.payload)
} else {
// No content locale given: nothing to override.
None
},
})
}

/// Constructs a borrowed version of this type for more efficient querying.
///
/// Most useful methods for segmentation are on this type.
Expand Down
7 changes: 7 additions & 0 deletions ffi/capi/bindings/c/LineSegmenter.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions ffi/capi/bindings/c/WordSegmenter.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading