2
2
3
3
use crate :: LineEnding ;
4
4
5
/// Sub-classification of a word character.
///
/// Carried by [`CharCategory::Word`] so that word characters of different
/// kinds (for example Hiragana next to CJK ideographs) can be distinguished
/// from one another instead of being lumped into a single category.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum WordCategory {
    /// Alphanumeric characters and the underscore.
    Alphanumeric,
    /// Superscript characters.
    Superscript,
    /// Subscript characters.
    Subscript,
    /// Braille patterns.
    Braille,
    /// Japanese Hiragana.
    Hiragana,
    /// Japanese Katakana.
    Katakana,
    /// Precomposed Korean Hangul syllables.
    HangulSyllable,
    /// CJK ideographs.
    CJKIdeograph,
}

/// Coarse classification of a single character.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum CharCategory {
    /// Whitespace that is not a line ending.
    Whitespace,
    /// A line-ending character.
    Eol,
    /// A word character, together with its finer-grained [`WordCategory`].
    Word(WordCategory),
    /// A punctuation character.
    Punctuation,
    /// Anything that fits none of the other categories.
    Unknown,
}
@@ -17,8 +29,8 @@ pub fn categorize_char(ch: char) -> CharCategory {
17
29
CharCategory :: Eol
18
30
} else if ch. is_whitespace ( ) {
19
31
CharCategory :: Whitespace
20
- } else if char_is_word ( ch) {
21
- CharCategory :: Word
32
+ } else if let Some ( cat ) = char_word_category ( ch) {
33
+ CharCategory :: Word ( cat )
22
34
} else if char_is_punctuation ( ch) {
23
35
CharCategory :: Punctuation
24
36
} else {
@@ -55,7 +67,7 @@ pub fn char_is_whitespace(ch: char) -> bool {
55
67
// En Quad, Em Quad, En Space, Em Space, Three-per-em Space,
56
68
// Four-per-em Space, Six-per-em Space, Figure Space,
57
69
// Punctuation Space, Thin Space, Hair Space, Zero Width Space.
58
- ch if ( '\u{2000}' ..= '\u{200B}' ) . contains ( & ch ) => true ,
70
+ '\u{2000}' ..= '\u{200B}' => true ,
59
71
60
72
_ => false ,
61
73
}
@@ -82,7 +94,31 @@ pub fn char_is_punctuation(ch: char) -> bool {
82
94
83
95
#[ inline]
84
96
pub fn char_is_word ( ch : char ) -> bool {
85
- ch. is_alphanumeric ( ) || ch == '_'
97
+ char_word_category ( ch) . is_some ( )
98
+ }
99
+
100
+ pub fn char_word_category ( ch : char ) -> Option < WordCategory > {
101
+ use WordCategory :: * ;
102
+
103
+ // Different subcategories so e.g. おはよう世界 is not treated as one block
104
+ let level = match ch {
105
+ '\u{2070}' ..='\u{207f}' => Superscript ,
106
+ '\u{2080}' ..='\u{2094}' => Subscript ,
107
+ '\u{2800}' ..='\u{28ff}' => Braille ,
108
+ '\u{3040}' ..='\u{309f}' => Hiragana ,
109
+ '\u{30a0}' ..='\u{30ff}' => Katakana ,
110
+ '\u{ac00}' ..='\u{d7a3}' => HangulSyllable ,
111
+
112
+ '\u{3300}' ..='\u{9fff}'
113
+ | '\u{f900}' ..='\u{faff}'
114
+ | '\u{20000}' ..='\u{2a6df}'
115
+ | '\u{2a700}' ..='\u{2b73f}'
116
+ | '\u{2b740}' ..='\u{2b81f}'
117
+ | '\u{2f800}' ..='\u{2fa1f}' => CJKIdeograph ,
118
+ ch if ch. is_alphanumeric ( ) || ch == '_' => Alphanumeric ,
119
+ _ => return None ,
120
+ } ;
121
+ Some ( level)
86
122
}
87
123
88
124
#[ cfg( test) ]
@@ -115,9 +151,8 @@ mod test {
115
151
}
116
152
117
153
for ch in WORD_TEST_CASE . chars ( ) {
118
- assert_eq ! (
119
- CharCategory :: Word ,
120
- categorize_char( ch) ,
154
+ assert ! (
155
+ matches!( categorize_char( ch) , CharCategory :: Word ( _) ) ,
121
156
"Testing '{}', but got `{:?}` instead of `Category::Word`" ,
122
157
ch,
123
158
categorize_char( ch)
0 commit comments