Skip to content

Commit 627ec92

Browse files
committed
wip
1 parent e1d1a5c commit 627ec92

File tree

9 files changed

+58
-29
lines changed

9 files changed

+58
-29
lines changed

helix-core/src/chars.rs

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use crate::LineEnding;
66
pub enum CharCategory {
77
Whitespace,
88
Eol,
9-
Word,
9+
Word(u8),
1010
Punctuation,
1111
Unknown,
1212
}
@@ -17,8 +17,8 @@ pub fn categorize_char(ch: char) -> CharCategory {
1717
CharCategory::Eol
1818
} else if ch.is_whitespace() {
1919
CharCategory::Whitespace
20-
} else if char_is_word(ch) {
21-
CharCategory::Word
20+
} else if let Some(n) = char_is_word(ch) {
21+
CharCategory::Word(n)
2222
} else if char_is_punctuation(ch) {
2323
CharCategory::Punctuation
2424
} else {
@@ -55,7 +55,7 @@ pub fn char_is_whitespace(ch: char) -> bool {
5555
// En Quad, Em Quad, En Space, Em Space, Three-per-em Space,
5656
// Four-per-em Space, Six-per-em Space, Figure Space,
5757
// Punctuation Space, Thin Space, Hair Space, Zero Width Space.
58-
ch if ('\u{2000}' ..= '\u{200B}').contains(&ch) => true,
58+
'\u{2000}' ..= '\u{200B}' => true,
5959

6060
_ => false,
6161
}
@@ -81,8 +81,26 @@ pub fn char_is_punctuation(ch: char) -> bool {
8181
}
8282

8383
#[inline]
84-
pub fn char_is_word(ch: char) -> bool {
85-
ch.is_alphanumeric() || ch == '_'
84+
pub fn char_is_word(ch: char) -> Option<u8> {
85+
// Different subcategories so e.g. おはよう世界 is not treated as one block
86+
let res = match ch {
87+
'\u{2070}'..='\u{207f}' => 1, // Superscript
88+
'\u{2080}'..='\u{2094}' => 2, // Subscript
89+
'\u{2800}'..='\u{28ff}' => 3, // Braille
90+
'\u{3040}'..='\u{309f}' => 4, // Hiragana
91+
'\u{30a0}'..='\u{30ff}' => 5, // Katakana
92+
'\u{ac00}'..='\u{d7a3}' => 6, // Hangul syllables
93+
// CJK Ideographs
94+
'\u{3300}'..='\u{9fff}'
95+
| '\u{f900}'..='\u{faff}'
96+
| '\u{20000}'..='\u{2a6df}'
97+
| '\u{2a700}'..='\u{2b73f}'
98+
| '\u{2b740}'..='\u{2b81f}'
99+
| '\u{2f800}'..='\u{2fa1f}' => 7,
100+
ch if ch.is_alphanumeric() || ch == '_' => 0,
101+
_ => return None,
102+
};
103+
Some(res)
86104
}
87105

88106
#[cfg(test)]
@@ -115,9 +133,8 @@ mod test {
115133
}
116134

117135
for ch in WORD_TEST_CASE.chars() {
118-
assert_eq!(
119-
CharCategory::Word,
120-
categorize_char(ch),
136+
assert!(
137+
matches!(categorize_char(ch), CharCategory::Word(_)),
121138
"Testing '{}', but got `{:?}` instead of `Category::Word`",
122139
ch,
123140
categorize_char(ch)

helix-core/src/graphemes.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ impl<'a> Grapheme<'a> {
7272
// This could however be improved in the future by considering unicode
7373
// character classes.
7474
pub fn is_word_boundary(&self) -> bool {
75-
!matches!(&self, Grapheme::Other { g,.. } if g.chars().all(char_is_word))
75+
!matches!(&self, Grapheme::Other { g,.. } if g.chars().all(|ch| char_is_word(ch).is_some()))
7676
}
7777
}
7878

helix-core/src/movement.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -495,16 +495,16 @@ fn is_word_boundary(a: char, b: char) -> bool {
495495

496496
fn is_long_word_boundary(a: char, b: char) -> bool {
497497
match (categorize_char(a), categorize_char(b)) {
498-
(CharCategory::Word, CharCategory::Punctuation)
499-
| (CharCategory::Punctuation, CharCategory::Word) => false,
498+
(CharCategory::Word(_), CharCategory::Punctuation)
499+
| (CharCategory::Punctuation, CharCategory::Word(_)) => false,
500500
(a, b) if a != b => true,
501501
_ => false,
502502
}
503503
}
504504

505505
fn is_sub_word_boundary(a: char, b: char, dir: Direction) -> bool {
506506
match (categorize_char(a), categorize_char(b)) {
507-
(CharCategory::Word, CharCategory::Word) => {
507+
(CharCategory::Word(_), CharCategory::Word(_)) => {
508508
if (a == '_') != (b == '_') {
509509
return true;
510510
}

helix-core/src/transaction.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,16 @@ impl Assoc {
4747
let chars = s.chars().count();
4848
match self {
4949
Assoc::After | Assoc::AfterSticky => chars,
50-
Assoc::AfterWord => s.chars().take_while(|&c| char_is_word(c)).count(),
50+
Assoc::AfterWord => s.chars().take_while(|&c| char_is_word(c).is_some()).count(),
5151
// return position before inserted text
5252
Assoc::Before | Assoc::BeforeSticky => 0,
53-
Assoc::BeforeWord => chars - s.chars().rev().take_while(|&c| char_is_word(c)).count(),
53+
Assoc::BeforeWord => {
54+
chars
55+
- s.chars()
56+
.rev()
57+
.take_while(|&c| char_is_word(c).is_some())
58+
.count()
59+
}
5460
}
5561
}
5662

helix-lsp/src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -278,13 +278,13 @@ pub mod util {
278278
- text
279279
.chars_at(cursor)
280280
.reversed()
281-
.take_while(|ch| chars::char_is_word(*ch))
281+
.take_while(|ch| chars::char_is_word(*ch).is_some())
282282
.count();
283283
let mut end = cursor;
284284
if replace_mode {
285285
end += text
286286
.chars_at(cursor)
287-
.take_while(|ch| chars::char_is_word(*ch))
287+
.take_while(|ch| chars::char_is_word(*ch).is_some())
288288
.count();
289289
}
290290
(start, end)

helix-term/src/commands.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2254,11 +2254,11 @@ fn search_selection_impl(cx: &mut Context, detect_word_boundaries: bool) {
22542254
fn is_at_word_start(text: RopeSlice, index: usize) -> bool {
22552255
let ch = text.char(index);
22562256
if index == 0 {
2257-
return char_is_word(ch);
2257+
return char_is_word(ch).is_some();
22582258
}
22592259
let prev_ch = text.char(index - 1);
22602260

2261-
!char_is_word(prev_ch) && char_is_word(ch)
2261+
!char_is_word(prev_ch).is_some() && char_is_word(ch).is_some()
22622262
}
22632263

22642264
fn is_at_word_end(text: RopeSlice, index: usize) -> bool {
@@ -2268,7 +2268,7 @@ fn search_selection_impl(cx: &mut Context, detect_word_boundaries: bool) {
22682268
let ch = text.char(index);
22692269
let prev_ch = text.char(index - 1);
22702270

2271-
char_is_word(prev_ch) && !char_is_word(ch)
2271+
char_is_word(prev_ch).is_some() && !char_is_word(ch).is_some()
22722272
}
22732273

22742274
let register = cx.register.unwrap_or('/');
@@ -6358,7 +6358,7 @@ fn jump_to_word(cx: &mut Context, behaviour: Movement) {
63586358
// the same char class as a word so `=<` would also count as a word.
63596359
let add_label = RevRopeGraphemes::new(text.slice(..cursor_fwd.head))
63606360
.take(2)
6361-
.take_while(|g| g.chars().all(char_is_word))
6361+
.take_while(|g| g.chars().all(|c| char_is_word(c).is_some()))
63626362
.count()
63636363
== 2;
63646364
if !add_label {
@@ -6368,7 +6368,7 @@ fn jump_to_word(cx: &mut Context, behaviour: Movement) {
63686368
// skip any leading whitespace
63696369
cursor_fwd.anchor += text
63706370
.chars_at(cursor_fwd.anchor)
6371-
.take_while(|&c| !char_is_word(c))
6371+
.take_while(|&c| char_is_word(c).is_none())
63726372
.count();
63736373
words.push(cursor_fwd);
63746374
if words.len() == jump_label_limit {
@@ -6384,7 +6384,7 @@ fn jump_to_word(cx: &mut Context, behaviour: Movement) {
63846384
// the same char class as a word so `=<` would also count as a word.
63856385
let add_label = RopeGraphemes::new(text.slice(cursor_rev.head..))
63866386
.take(2)
6387-
.take_while(|g| g.chars().all(char_is_word))
6387+
.take_while(|g| g.chars().all(|c| char_is_word(c).is_some()))
63886388
.count()
63896389
== 2;
63906390
if !add_label {
@@ -6394,7 +6394,7 @@ fn jump_to_word(cx: &mut Context, behaviour: Movement) {
63946394
cursor_rev.anchor -= text
63956395
.chars_at(cursor_rev.anchor)
63966396
.reversed()
6397-
.take_while(|&c| !char_is_word(c))
6397+
.take_while(|&c| char_is_word(c).is_none())
63986398
.count();
63996399
words.push(cursor_rev);
64006400
if words.len() == jump_label_limit {

helix-term/src/handlers/completion.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,7 @@ pub fn trigger_auto_completion(
381381
.chars_at(cursor)
382382
.reversed()
383383
.take(config.completion_trigger_len as usize)
384-
.all(char_is_word);
384+
.all(|c| char_is_word(c).is_some());
385385

386386
if is_auto_trigger {
387387
send_blocking(

helix-term/src/ui/completion.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,7 @@ impl Completion {
378378
let offset = text
379379
.chars_at(cursor)
380380
.reversed()
381-
.take_while(|ch| chars::char_is_word(*ch))
381+
.take_while(|ch| chars::char_is_word(*ch).is_some())
382382
.count();
383383
let start_offset = cursor.saturating_sub(offset);
384384

helix-view/src/document.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1969,9 +1969,15 @@ impl Document {
19691969
Vec::new()
19701970
};
19711971

1972-
let ends_at_word =
1973-
start != end && end != 0 && text.get_char(end - 1).map_or(false, char_is_word);
1974-
let starts_at_word = start != end && text.get_char(start).map_or(false, char_is_word);
1972+
let ends_at_word = start != end
1973+
&& end != 0
1974+
&& text
1975+
.get_char(end - 1)
1976+
.map_or(false, |ch| char_is_word(ch).is_some());
1977+
let starts_at_word = start != end
1978+
&& text
1979+
.get_char(start)
1980+
.map_or(false, |ch| char_is_word(ch).is_some());
19751981

19761982
Some(Diagnostic {
19771983
range: Range { start, end },

0 commit comments

Comments
 (0)