Skip to content

Commit 34531b8

Browse files
committed
Add word subcategories for more granular word movement
1 parent 07e7e75 commit 34531b8

File tree

2 files changed

+46
-11
lines changed

2 files changed

+46
-11
lines changed

helix-core/src/chars.rs

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,23 @@
22
33
use crate::LineEnding;
44

5+
/// Subcategory of a word character.
///
/// Splitting word characters by script lets word motions stop where the
/// script changes, so that e.g. `おはよう世界` is not treated as one block.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum WordCategory {
    /// Alphanumeric characters and `_`.
    Alphanumeric,
    /// Superscript forms.
    Superscript,
    /// Subscript forms.
    Subscript,
    /// Braille patterns.
    Braille,
    /// Japanese hiragana.
    Hiragana,
    /// Japanese katakana.
    Katakana,
    /// Precomposed Hangul syllables.
    HangulSyllable,
    /// CJK ideographs.
    CJKIdeograph,
}
16+
517
#[derive(Debug, Eq, PartialEq)]
618
pub enum CharCategory {
719
Whitespace,
820
Eol,
9-
Word,
21+
Word(WordCategory),
1022
Punctuation,
1123
Unknown,
1224
}
@@ -17,8 +29,8 @@ pub fn categorize_char(ch: char) -> CharCategory {
1729
CharCategory::Eol
1830
} else if ch.is_whitespace() {
1931
CharCategory::Whitespace
20-
} else if char_is_word(ch) {
21-
CharCategory::Word
32+
} else if let Some(cat) = char_word_category(ch) {
33+
CharCategory::Word(cat)
2234
} else if char_is_punctuation(ch) {
2335
CharCategory::Punctuation
2436
} else {
@@ -55,7 +67,7 @@ pub fn char_is_whitespace(ch: char) -> bool {
5567
// En Quad, Em Quad, En Space, Em Space, Three-per-em Space,
5668
// Four-per-em Space, Six-per-em Space, Figure Space,
5769
// Punctuation Space, Thin Space, Hair Space, Zero Width Space.
58-
ch if ('\u{2000}' ..= '\u{200B}').contains(&ch) => true,
70+
'\u{2000}' ..= '\u{200B}' => true,
5971

6072
_ => false,
6173
}
@@ -82,7 +94,31 @@ pub fn char_is_punctuation(ch: char) -> bool {
8294

8395
#[inline]
8496
pub fn char_is_word(ch: char) -> bool {
85-
ch.is_alphanumeric() || ch == '_'
97+
char_word_category(ch).is_some()
98+
}
99+
100+
pub fn char_word_category(ch: char) -> Option<WordCategory> {
101+
use WordCategory::*;
102+
103+
// Different subcategories so e.g. おはよう世界 is not treated as one block
104+
let level = match ch {
105+
'\u{2070}'..='\u{207f}' => Superscript,
106+
'\u{2080}'..='\u{2094}' => Subscript,
107+
'\u{2800}'..='\u{28ff}' => Braille,
108+
'\u{3040}'..='\u{309f}' => Hiragana,
109+
'\u{30a0}'..='\u{30ff}' => Katakana,
110+
'\u{ac00}'..='\u{d7a3}' => HangulSyllable,
111+
112+
'\u{3300}'..='\u{9fff}'
113+
| '\u{f900}'..='\u{faff}'
114+
| '\u{20000}'..='\u{2a6df}'
115+
| '\u{2a700}'..='\u{2b73f}'
116+
| '\u{2b740}'..='\u{2b81f}'
117+
| '\u{2f800}'..='\u{2fa1f}' => CJKIdeograph,
118+
ch if ch.is_alphanumeric() || ch == '_' => Alphanumeric,
119+
_ => return None,
120+
};
121+
Some(level)
86122
}
87123

88124
#[cfg(test)]
@@ -115,9 +151,8 @@ mod test {
115151
}
116152

117153
for ch in WORD_TEST_CASE.chars() {
118-
assert_eq!(
119-
CharCategory::Word,
120-
categorize_char(ch),
154+
assert!(
155+
matches!(categorize_char(ch), CharCategory::Word(_)),
121156
"Testing '{}', but got `{:?}` instead of `Category::Word`",
122157
ch,
123158
categorize_char(ch)

helix-core/src/movement.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -495,16 +495,16 @@ fn is_word_boundary(a: char, b: char) -> bool {
495495

496496
fn is_long_word_boundary(a: char, b: char) -> bool {
497497
match (categorize_char(a), categorize_char(b)) {
498-
(CharCategory::Word, CharCategory::Punctuation)
499-
| (CharCategory::Punctuation, CharCategory::Word) => false,
498+
(CharCategory::Word(_), CharCategory::Punctuation)
499+
| (CharCategory::Punctuation, CharCategory::Word(_)) => false,
500500
(a, b) if a != b => true,
501501
_ => false,
502502
}
503503
}
504504

505505
fn is_sub_word_boundary(a: char, b: char, dir: Direction) -> bool {
506506
match (categorize_char(a), categorize_char(b)) {
507-
(CharCategory::Word, CharCategory::Word) => {
507+
(CharCategory::Word(_), CharCategory::Word(_)) => {
508508
if (a == '_') != (b == '_') {
509509
return true;
510510
}

0 commit comments

Comments
 (0)