unicode-org · robertbastian · Dec 18, 2025 · Dec 3, 2025 · Dec 9, 2025 · Dec 9, 2025
@@ -9,6 +9,8 @@
         - Add conversion between `icu::properties::props::Script` and `icu::locale::subtags::Script` (unicode-org#7270)
         - Add conversion between `icu::properties::props::BidiClass` and `unicode_bidi::BidiClass` (unicode-org#7272)
         - Constify `PropertyNamesLong`/`PropertyNamesShort`/`PropertyParser` constructors (unicode-org#7294)
+    - `icu_segmenter`
+        - Add non-complex line and word break constructors (unicode-org#7268)
 - Utils
   - Retire the `icu_harfbuzz` crate. The `icu_properties` and `icu_normalizer` types now directly implement the `harfbuzz-traits`
 

@@ -5,6 +5,7 @@
 [package]
 name = "icu_segmenter"
 description = "Unicode line breaking and text segmentation algorithms for text boundaries analysis"
+version = "2.1.2"
 
 authors.workspace = true
 categories.workspace = true
@@ -14,7 +15,6 @@ include.workspace = true
 license.workspace = true
 repository.workspace = true
 rust-version.workspace = true
-version.workspace = true
 
 [package.metadata.docs.rs]
 all-features = true
@@ -52,7 +52,7 @@ default = ["compiled_data", "auto"]
 serde = ["dep:serde", "potential_utf/serde", "zerovec/serde", "icu_collections/serde", "icu_provider/serde"]
 datagen = ["serde", "dep:databake", "potential_utf/databake", "zerovec/databake", "icu_collections/databake", "icu_provider/export"]
 lstm = ["dep:core_maths"]
-auto = ["lstm"] # Enabled try_new_auto_unstable constructors
+auto = ["lstm"] # Enables [try_]new_auto constructors
 compiled_data = ["dep:icu_segmenter_data", "dep:icu_locale", "icu_locale?/compiled_data", "icu_provider/baked"]
 
 [lib]

@@ -256,6 +256,18 @@ impl ComplexPayloadsBorrowed<'static> {
         }
     }
 
+    #[cfg(feature = "compiled_data")]
+    pub(crate) const fn empty() -> Self {
+        Self {
+            grapheme: GraphemeClusterSegmenter::new(),
+            my: None,
+            km: None,
+            lo: None,
+            th: None,
+            ja: None,
+        }
+    }
+
     pub(crate) fn static_to_owned(self) -> ComplexPayloads {
         ComplexPayloads {
             grapheme: self.grapheme.static_to_owned(),
@@ -379,6 +391,20 @@ impl ComplexPayloads {
             ja: None,
         })
     }
+
+    pub(crate) fn try_new_empty<D>(provider: &D) -> Result<Self, DataError>
+    where
+        D: DataProvider<SegmenterBreakGraphemeClusterV1> + ?Sized,
+    {
+        Ok(Self {
+            grapheme: GraphemeClusterSegmenter::try_new_unstable(provider)?,
+            my: None,
+            km: None,
+            lo: None,
+            th: None,
+            ja: None,
+        })
+    }
 }
 fn try_load<M: DataMarker, P: DataProvider<M> + ?Sized>(
     provider: &P,

@@ -10,7 +10,7 @@ use alloc::string::String;
 use alloc::vec;
 use alloc::vec::Vec;
 use core::char;
-use icu_locale_core::subtags::language;
+use icu_locale_core::subtags::{language, Language};
 use icu_locale_core::LanguageIdentifier;
 use icu_provider::prelude::*;
 use utf8_iter::Utf8CharIndices;
@@ -208,24 +208,42 @@ pub struct LineBreakOptions<'a> {
     pub content_locale: Option<&'a LanguageIdentifier>,
 }
 
+impl LineBreakOptions<'_> {
+    /// `const` version of [`Default::default`]
+    pub const fn default() -> Self {
+        Self {
+            strictness: None,
+            word_option: None,
+            content_locale: None,
+        }
+    }
+}
+
 #[derive(Debug, Clone, Copy)]
 struct ResolvedLineBreakOptions {
     strictness: LineBreakStrictness,
     word_option: LineBreakWordOption,
     ja_zh: bool,
 }
 
-impl From<LineBreakOptions<'_>> for ResolvedLineBreakOptions {
-    fn from(options: LineBreakOptions<'_>) -> Self {
-        let ja_zh = if let Some(content_locale) = options.content_locale.as_ref() {
-            content_locale.language == language!("ja") || content_locale.language == language!("zh")
-        } else {
-            false
-        };
-        Self {
-            strictness: options.strictness.unwrap_or_default(),
-            word_option: options.word_option.unwrap_or_default(),
-            ja_zh,
+impl LineBreakOptions<'_> {
+    const fn resolve(self) -> ResolvedLineBreakOptions {
+        ResolvedLineBreakOptions {
+            strictness: match self.strictness {
+                Some(s) => s,
+                None => LineBreakStrictness::Strict,
+            },
+            word_option: match self.word_option {
+                Some(s) => s,
+                None => LineBreakWordOption::Normal,
+            },
+            ja_zh: if let Some(content_locale) = self.content_locale.as_ref() {
+                const JA: Language = language!("ja");
+                const ZH: Language = language!("zh");
+                matches!(content_locale.language, JA | ZH)
+            } else {
+                false
+            },
         }
     }
 }
@@ -426,7 +444,7 @@ impl LineSegmenter {
     #[cfg(feature = "compiled_data")]
     pub fn new_lstm(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
         LineSegmenterBorrowed {
-            options: options.into(),
+            options: options.resolve(),
             data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
             complex: ComplexPayloadsBorrowed::new_lstm(),
         }
@@ -456,7 +474,7 @@ impl LineSegmenter {
             + ?Sized,
     {
         Ok(Self {
-            options: options.into(),
+            options: options.resolve(),
             payload: provider.load(Default::default())?.payload,
             complex: ComplexPayloads::try_new_lstm(provider)?,
         })
@@ -468,15 +486,13 @@ impl LineSegmenter {
     /// The dictionary model uses a list of words to determine appropriate breakpoints. It is
     /// faster than the LSTM model but requires more data.
     ///
-    /// See also [`Self::new_dictionary`].
-    ///
     /// ✨ *Enabled with the `compiled_data` Cargo feature.*
     ///
     /// [📚 Help choosing a constructor](icu_provider::constructors)
     #[cfg(feature = "compiled_data")]
     pub fn new_dictionary(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
         LineSegmenterBorrowed {
-            options: options.into(),
+            options: options.resolve(),
             data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
             // Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handle CJK
             // characters [1]. Southeast Asian languages however require complex context analysis
@@ -510,7 +526,7 @@ impl LineSegmenter {
             + ?Sized,
     {
         Ok(Self {
-            options: options.into(),
+            options: options.resolve(),
             payload: provider.load(Default::default())?.payload,
             // Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handle CJK
             // characters [1]. Southeast Asian languages however require complex context analysis
@@ -522,6 +538,50 @@ impl LineSegmenter {
         })
     }
 
+    /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
+    /// no support for scripts requiring complex, context-dependent line breaks (Khmer, Lao, Myanmar, Thai).
+    ///
+    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+    ///
+    /// [📚 Help choosing a constructor](icu_provider::constructors)
+    #[cfg(feature = "compiled_data")]
+    pub const fn new_for_non_complex_scripts(
+        options: LineBreakOptions,
+    ) -> LineSegmenterBorrowed<'static> {
+        LineSegmenterBorrowed {
+            options: options.resolve(),
+            data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
+            complex: ComplexPayloadsBorrowed::empty(),
+        }
+    }
+
+    icu_provider::gen_buffer_data_constructors!(
+        (options: LineBreakOptions) -> error: DataError,
+        functions: [
+            new_for_non_complex_scripts: skip,
+            try_new_for_non_complex_scripts_with_buffer_provider,
+            try_new_for_non_complex_scripts_unstable,
+            Self,
+        ]
+    );
+
+    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_for_non_complex_scripts)]
+    pub fn try_new_for_non_complex_scripts_unstable<D>(
+        provider: &D,
+        options: LineBreakOptions,
+    ) -> Result<Self, DataError>
+    where
+        D: DataProvider<SegmenterBreakLineV1>
+            + DataProvider<SegmenterBreakGraphemeClusterV1>
+            + ?Sized,
+    {
+        Ok(Self {
+            options: options.resolve(),
+            payload: provider.load(Default::default())?.payload,
+            complex: ComplexPayloads::try_new_empty(provider)?,
+        })
+    }
+
     /// Constructs a borrowed version of this type for more efficient querying.
     ///
     /// Most useful methods for segmentation are on this type.

@@ -57,13 +57,30 @@ pub struct WordBreakOptions<'a> {
     pub invariant_options: WordBreakInvariantOptions,
 }
 
+impl WordBreakOptions<'_> {
+    /// `const` version of [`Default::default`]
+    pub const fn default() -> Self {
+        Self {
+            content_locale: None,
+            invariant_options: WordBreakInvariantOptions::default(),
+        }
+    }
+}
+
 /// Locale-independent options to tailor word breaking behavior
 ///
 /// Currently empty but may grow in the future
 #[non_exhaustive]
 #[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
 pub struct WordBreakInvariantOptions {}
 
+impl WordBreakInvariantOptions {
+    /// `const` version of [`Default::default`]
+    pub const fn default() -> Self {
+        Self {}
+    }
+}
+
 /// Implements the [`Iterator`] trait over the word boundaries of the given string.
 ///
 /// Lifetimes:
@@ -513,6 +530,68 @@ impl WordSegmenter {
             },
         })
     }
+
+    /// Constructs a [`WordSegmenter`] with an invariant locale and no support for
+    /// scripts requiring complex, context-dependent word breaks (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai).
+    ///
+    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+    ///
+    /// [📚 Help choosing a constructor](icu_provider::constructors)
+    #[cfg(feature = "compiled_data")]
+    pub const fn new_for_non_complex_scripts(
+        _options: WordBreakInvariantOptions,
+    ) -> WordSegmenterBorrowed<'static> {
+        WordSegmenterBorrowed {
+            data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
+            complex: ComplexPayloadsBorrowed::empty(),
+            locale_override: None,
+        }
+    }
+
+    icu_provider::gen_buffer_data_constructors!(
+        (options: WordBreakOptions) -> error: DataError,
+        functions: [
+            try_new_for_non_complex_scripts,
+            try_new_for_non_complex_scripts_with_buffer_provider,
+            try_new_for_non_complex_scripts_unstable,
+            Self
+        ]
+    );
+
+    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_for_non_complex_scripts)]
+    pub fn try_new_for_non_complex_scripts_unstable<D>(
+        provider: &D,
+        options: WordBreakOptions,
+    ) -> Result<Self, DataError>
+    where
+        D: DataProvider<SegmenterBreakWordV1>
+            + DataProvider<SegmenterBreakWordOverrideV1>
+            + DataProvider<SegmenterBreakGraphemeClusterV1>
+            + ?Sized,
+    {
+        Ok(Self {
+            payload: provider.load(Default::default())?.payload,
+            complex: ComplexPayloads::try_new_empty(provider)?,
+            payload_locale_override: if let Some(locale) = options.content_locale {
+                let locale = DataLocale::from(locale);
+                let req = DataRequest {
+                    id: DataIdentifierBorrowed::for_locale(&locale),
+                    metadata: {
+                        let mut metadata = DataRequestMetadata::default();
+                        metadata.silent = true;
+                        metadata
+                    },
+                };
+                provider
+                    .load(req)
+                    .allow_identifier_not_found()?
+                    .map(|r| r.payload)
+            } else {
+                None
+            },
+        })
+    }
+
     /// Constructs a borrowed version of this type for more efficient querying.
     ///
     /// Most useful methods for segmentation are on this type.