Skip to content

Commit 1809310

Browse files
committed
new_non_complex
1 parent d512d22 commit 1809310

File tree

4 files changed

+128
-3
lines changed

4 files changed

+128
-3
lines changed

components/segmenter/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ default = ["compiled_data", "auto"]
5252
serde = ["dep:serde", "potential_utf/serde", "zerovec/serde", "icu_collections/serde", "icu_provider/serde"]
5353
datagen = ["serde", "dep:databake", "potential_utf/databake", "zerovec/databake", "icu_collections/databake", "icu_provider/export"]
5454
lstm = ["dep:core_maths"]
55-
auto = ["lstm"] # Enabled try_new_auto_unstable constructors
55+
auto = ["lstm"] # Enables [try_]new_auto constructors
5656
compiled_data = ["dep:icu_segmenter_data", "dep:icu_locale", "icu_locale?/compiled_data", "icu_provider/baked"]
5757

5858
[lib]

components/segmenter/src/complex/mod.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,17 @@ impl ComplexPayloadsBorrowed<'static> {
256256
}
257257
}
258258

259+
/// Builds a payload set that carries only the grapheme cluster segmenter.
///
/// Every per-language slot (`my`, `km`, `lo`, `th`, `ja`) is left as `None`, so no
/// complex-script model is attached to the returned value.
// Gated because the body relies on `GraphemeClusterSegmenter::new()`.
// NOTE(review): that constructor is assumed to be available only with the
// `compiled_data` feature — confirm; without the gate a build lacking the
// feature would fail to resolve it.
#[cfg(feature = "compiled_data")]
pub(crate) fn empty() -> Self {
    Self {
        grapheme: GraphemeClusterSegmenter::new(),
        my: None,
        km: None,
        lo: None,
        th: None,
        ja: None,
    }
}
269+
259270
pub(crate) fn static_to_owned(self) -> ComplexPayloads {
260271
ComplexPayloads {
261272
grapheme: self.grapheme.static_to_owned(),
@@ -379,6 +390,20 @@ impl ComplexPayloads {
379390
ja: None,
380391
})
381392
}
393+
394+
pub(crate) fn try_new_empty<D>(provider: &D) -> Result<Self, DataError>
395+
where
396+
D: DataProvider<SegmenterBreakGraphemeClusterV1> + ?Sized,
397+
{
398+
Ok(Self {
399+
grapheme: GraphemeClusterSegmenter::try_new_unstable(provider)?,
400+
my: None,
401+
km: None,
402+
lo: None,
403+
th: None,
404+
ja: None,
405+
})
406+
}
382407
}
383408
fn try_load<M: DataMarker, P: DataProvider<M> + ?Sized>(
384409
provider: &P,

components/segmenter/src/line.rs

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -468,8 +468,6 @@ impl LineSegmenter {
468468
/// The dictionary model uses a list of words to determine appropriate breakpoints. It is
469469
/// faster than the LSTM model but requires more data.
470470
///
471-
/// See also [`Self::new_dictionary`].
472-
///
473471
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
474472
///
475473
/// [📚 Help choosing a constructor](icu_provider::constructors)
@@ -522,6 +520,48 @@ impl LineSegmenter {
522520
})
523521
}
524522

523+
/// Creates a [`LineSegmenter`] for the invariant locale using the given
/// [`LineBreakOptions`], without support for complex scripts (Khmer, Lao, Myanmar, Thai).
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
pub fn new_non_complex(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
    // Resolve the user-facing options into the stored representation.
    let options = options.into();
    // Line-break rule data comes straight from the baked singleton.
    let data = crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1;
    // No per-language complex-script payloads are attached.
    let complex = ComplexPayloadsBorrowed::empty();

    LineSegmenterBorrowed {
        options,
        data,
        complex,
    }
}
537+
538+
icu_provider::gen_buffer_data_constructors!(
539+
(options: LineBreakOptions) -> error: DataError,
540+
functions: [
541+
new_non_complex: skip,
542+
try_new_non_complex_with_buffer_provider,
543+
try_new_non_complex_unstable,
544+
Self,
545+
]
546+
);
547+
548+
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_non_complex)]
549+
pub fn try_new_non_complex_unstable<D>(
550+
provider: &D,
551+
options: LineBreakOptions,
552+
) -> Result<Self, DataError>
553+
where
554+
D: DataProvider<SegmenterBreakLineV1>
555+
+ DataProvider<SegmenterBreakGraphemeClusterV1>
556+
+ ?Sized,
557+
{
558+
Ok(Self {
559+
options: options.into(),
560+
payload: provider.load(Default::default())?.payload,
561+
complex: ComplexPayloads::try_new_empty(provider)?,
562+
})
563+
}
564+
525565
/// Constructs a borrowed version of this type for more efficient querying.
526566
///
527567
/// Most useful methods for segmentation are on this type.

components/segmenter/src/word.rs

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -513,6 +513,66 @@ impl WordSegmenter {
513513
},
514514
})
515515
}
516+
517+
/// Creates a [`WordSegmenter`] for the invariant locale, without support for
/// complex scripts (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai).
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
pub fn new_non_complex(_options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
    // `_options` is accepted for API shape only; it is not read here.
    // No per-language complex-script payloads are attached.
    let complex = ComplexPayloadsBorrowed::empty();

    WordSegmenterBorrowed {
        data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
        complex,
        // Invariant locale: no locale-specific override is installed.
        locale_override: None,
    }
}
531+
532+
icu_provider::gen_buffer_data_constructors!(
533+
(options: WordBreakOptions) -> error: DataError,
534+
functions: [
535+
try_new_non_complex,
536+
try_new_non_complex_with_buffer_provider,
537+
try_new_non_complex_unstable,
538+
Self
539+
]
540+
);
541+
542+
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_non_complex)]
543+
pub fn try_new_non_complex_unstable<D>(
544+
provider: &D,
545+
options: WordBreakOptions,
546+
) -> Result<Self, DataError>
547+
where
548+
D: DataProvider<SegmenterBreakWordV1>
549+
+ DataProvider<SegmenterBreakWordOverrideV1>
550+
+ DataProvider<SegmenterBreakGraphemeClusterV1>
551+
+ ?Sized,
552+
{
553+
Ok(Self {
554+
payload: provider.load(Default::default())?.payload,
555+
complex: ComplexPayloads::try_new_empty(provider)?,
556+
payload_locale_override: if let Some(locale) = options.content_locale {
557+
let locale = DataLocale::from(locale);
558+
let req = DataRequest {
559+
id: DataIdentifierBorrowed::for_locale(&locale),
560+
metadata: {
561+
let mut metadata = DataRequestMetadata::default();
562+
metadata.silent = true;
563+
metadata
564+
},
565+
};
566+
provider
567+
.load(req)
568+
.allow_identifier_not_found()?
569+
.map(|r| r.payload)
570+
} else {
571+
None
572+
},
573+
})
574+
}
575+
516576
/// Constructs a borrowed version of this type for more efficient querying.
517577
///
518578
/// Most useful methods for segmentation are on this type.

0 commit comments

Comments
 (0)