Skip to content

Commit c93669e

Browse files
committed
new_non_complex
1 parent d512d22 commit c93669e

File tree

20 files changed

+763
-70
lines changed

20 files changed

+763
-70
lines changed

components/segmenter/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ default = ["compiled_data", "auto"]
5252
serde = ["dep:serde", "potential_utf/serde", "zerovec/serde", "icu_collections/serde", "icu_provider/serde"]
5353
datagen = ["serde", "dep:databake", "potential_utf/databake", "zerovec/databake", "icu_collections/databake", "icu_provider/export"]
5454
lstm = ["dep:core_maths"]
55-
auto = ["lstm"] # Enabled try_new_auto_unstable constructors
55+
auto = ["lstm"] # Enables [try_]new_auto constructors
5656
compiled_data = ["dep:icu_segmenter_data", "dep:icu_locale", "icu_locale?/compiled_data", "icu_provider/baked"]
5757

5858
[lib]

components/segmenter/src/complex/mod.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,18 @@ impl ComplexPayloadsBorrowed<'static> {
256256
}
257257
}
258258

259+
#[cfg(feature = "compiled_data")]
260+
pub(crate) fn empty() -> Self {
261+
Self {
262+
grapheme: GraphemeClusterSegmenter::new(),
263+
my: None,
264+
km: None,
265+
lo: None,
266+
th: None,
267+
ja: None,
268+
}
269+
}
270+
259271
pub(crate) fn static_to_owned(self) -> ComplexPayloads {
260272
ComplexPayloads {
261273
grapheme: self.grapheme.static_to_owned(),
@@ -379,6 +391,20 @@ impl ComplexPayloads {
379391
ja: None,
380392
})
381393
}
394+
395+
pub(crate) fn try_new_empty<D>(provider: &D) -> Result<Self, DataError>
396+
where
397+
D: DataProvider<SegmenterBreakGraphemeClusterV1> + ?Sized,
398+
{
399+
Ok(Self {
400+
grapheme: GraphemeClusterSegmenter::try_new_unstable(provider)?,
401+
my: None,
402+
km: None,
403+
lo: None,
404+
th: None,
405+
ja: None,
406+
})
407+
}
382408
}
383409
fn try_load<M: DataMarker, P: DataProvider<M> + ?Sized>(
384410
provider: &P,

components/segmenter/src/line.rs

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -468,8 +468,6 @@ impl LineSegmenter {
468468
/// The dictionary model uses a list of words to determine appropriate breakpoints. It is
469469
/// faster than the LSTM model but requires more data.
470470
///
471-
/// See also [`Self::new_dictionary`].
472-
///
473471
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
474472
///
475473
/// [📚 Help choosing a constructor](icu_provider::constructors)
@@ -522,6 +520,48 @@ impl LineSegmenter {
522520
})
523521
}
524522

523+
/// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
524+
/// no support for complex scripts (Khmer, Lao, Myanmar, and Thai).
525+
///
526+
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
527+
///
528+
/// [📚 Help choosing a constructor](icu_provider::constructors)
529+
#[cfg(feature = "compiled_data")]
530+
pub fn new_non_complex(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
531+
LineSegmenterBorrowed {
532+
options: options.into(),
533+
data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
534+
complex: ComplexPayloadsBorrowed::empty(),
535+
}
536+
}
537+
538+
icu_provider::gen_buffer_data_constructors!(
539+
(options: LineBreakOptions) -> error: DataError,
540+
functions: [
541+
new_non_complex: skip,
542+
try_new_non_complex_with_buffer_provider,
543+
try_new_non_complex_unstable,
544+
Self,
545+
]
546+
);
547+
548+
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_non_complex)]
549+
pub fn try_new_non_complex_unstable<D>(
550+
provider: &D,
551+
options: LineBreakOptions,
552+
) -> Result<Self, DataError>
553+
where
554+
D: DataProvider<SegmenterBreakLineV1>
555+
+ DataProvider<SegmenterBreakGraphemeClusterV1>
556+
+ ?Sized,
557+
{
558+
Ok(Self {
559+
options: options.into(),
560+
payload: provider.load(Default::default())?.payload,
561+
complex: ComplexPayloads::try_new_empty(provider)?,
562+
})
563+
}
564+
525565
/// Constructs a borrowed version of this type for more efficient querying.
526566
///
527567
/// Most useful methods for segmentation are on this type.

components/segmenter/src/word.rs

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -513,6 +513,66 @@ impl WordSegmenter {
513513
},
514514
})
515515
}
516+
517+
/// Constructs a [`WordSegmenter`] with an invariant locale and no support for
518+
/// complex scripts (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai).
519+
///
520+
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
521+
///
522+
/// [📚 Help choosing a constructor](icu_provider::constructors)
523+
#[cfg(feature = "compiled_data")]
524+
pub fn new_non_complex(_options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
525+
WordSegmenterBorrowed {
526+
data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
527+
complex: ComplexPayloadsBorrowed::empty(),
528+
locale_override: None,
529+
}
530+
}
531+
532+
icu_provider::gen_buffer_data_constructors!(
533+
(options: WordBreakOptions) -> error: DataError,
534+
functions: [
535+
try_new_non_complex,
536+
try_new_non_complex_with_buffer_provider,
537+
try_new_non_complex_unstable,
538+
Self
539+
]
540+
);
541+
542+
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_non_complex)]
543+
pub fn try_new_non_complex_unstable<D>(
544+
provider: &D,
545+
options: WordBreakOptions,
546+
) -> Result<Self, DataError>
547+
where
548+
D: DataProvider<SegmenterBreakWordV1>
549+
+ DataProvider<SegmenterBreakWordOverrideV1>
550+
+ DataProvider<SegmenterBreakGraphemeClusterV1>
551+
+ ?Sized,
552+
{
553+
Ok(Self {
554+
payload: provider.load(Default::default())?.payload,
555+
complex: ComplexPayloads::try_new_empty(provider)?,
556+
payload_locale_override: if let Some(locale) = options.content_locale {
557+
let locale = DataLocale::from(locale);
558+
let req = DataRequest {
559+
id: DataIdentifierBorrowed::for_locale(&locale),
560+
metadata: {
561+
let mut metadata = DataRequestMetadata::default();
562+
metadata.silent = true;
563+
metadata
564+
},
565+
};
566+
provider
567+
.load(req)
568+
.allow_identifier_not_found()?
569+
.map(|r| r.payload)
570+
} else {
571+
None
572+
},
573+
})
574+
}
575+
516576
/// Constructs a borrowed version of this type for more efficient querying.
517577
///
518578
/// Most useful methods for segmentation are on this type.

ffi/capi/bindings/c/LineSegmenter.h

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ffi/capi/bindings/c/WordSegmenter.h

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ffi/capi/bindings/cpp/icu4x/LineSegmenter.d.hpp

Lines changed: 24 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ffi/capi/bindings/cpp/icu4x/LineSegmenter.hpp

Lines changed: 25 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)