Skip to content

Commit 69b9f0e

Browse files
MStarha and lucasmerlin authored
Rework TextEdit arrow navigation to handle Unicode graphemes (#5812)
* [x] I have followed the instructions in the PR template

Previously, navigating text in `TextEdit` with Ctrl + left/right arrow would jump inside words that contained combining characters (i.e. diacritics).

This PR introduces a new dependency on `unicode-segmentation` to handle grapheme segmentation. The new implementation skips over whitespace and other separators such as `-` (dash) between words, but treats `_` (underscore) as part of a word.

---------

Co-authored-by: lucasmerlin <[email protected]>
1 parent a0f072a commit 69b9f0e

File tree

5 files changed

+76
-19
lines changed

5 files changed

+76
-19
lines changed

Cargo.lock

+1
Original file line numberDiff line numberDiff line change
@@ -1285,6 +1285,7 @@ dependencies = [
12851285
"profiling",
12861286
"ron",
12871287
"serde",
1288+
"unicode-segmentation",
12881289
]
12891290

12901291
[[package]]

Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ serde = { version = "1", features = ["derive"] }
9999
similar-asserts = "1.4.2"
100100
thiserror = "1.0.37"
101101
type-map = "0.5.0"
102+
unicode-segmentation = "1.12.0"
102103
wasm-bindgen = "0.2"
103104
wasm-bindgen-futures = "0.4"
104105
web-sys = "0.3.73"

crates/egui/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ ahash.workspace = true
8787
bitflags.workspace = true
8888
nohash-hasher.workspace = true
8989
profiling.workspace = true
90+
unicode-segmentation.workspace = true
9091

9192
#! ### Optional dependencies
9293
accesskit = { workspace = true, optional = true }

crates/egui/src/text_selection/text_cursor_state.rs

+67-17
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
//! Text cursor changes/interaction, without modifying the text.
22
33
use epaint::text::{cursor::CCursor, Galley};
4+
use unicode_segmentation::UnicodeSegmentation;
45

56
use crate::{epaint, NumExt, Rect, Response, Ui};
67

@@ -166,7 +167,7 @@ fn select_line_at(text: &str, ccursor: CCursor) -> CCursorRange {
166167

167168
pub fn ccursor_next_word(text: &str, ccursor: CCursor) -> CCursor {
168169
CCursor {
169-
index: next_word_boundary_char_index(text.chars(), ccursor.index),
170+
index: next_word_boundary_char_index(text, ccursor.index),
170171
prefer_next_row: false,
171172
}
172173
}
@@ -180,9 +181,10 @@ fn ccursor_next_line(text: &str, ccursor: CCursor) -> CCursor {
180181

181182
pub fn ccursor_previous_word(text: &str, ccursor: CCursor) -> CCursor {
182183
let num_chars = text.chars().count();
184+
let reversed: String = text.graphemes(true).rev().collect();
183185
CCursor {
184186
index: num_chars
185-
- next_word_boundary_char_index(text.chars().rev(), num_chars - ccursor.index),
187+
- next_word_boundary_char_index(&reversed, num_chars - ccursor.index).min(num_chars),
186188
prefer_next_row: true,
187189
}
188190
}
@@ -196,22 +198,25 @@ fn ccursor_previous_line(text: &str, ccursor: CCursor) -> CCursor {
196198
}
197199
}
198200

199-
fn next_word_boundary_char_index(it: impl Iterator<Item = char>, mut index: usize) -> usize {
200-
let mut it = it.skip(index);
201-
if let Some(_first) = it.next() {
202-
index += 1;
203-
204-
if let Some(second) = it.next() {
205-
index += 1;
206-
for next in it {
207-
if is_word_char(next) != is_word_char(second) {
208-
break;
209-
}
210-
index += 1;
211-
}
201+
fn next_word_boundary_char_index(text: &str, index: usize) -> usize {
202+
for word in text.split_word_bound_indices() {
203+
// Splitting treats contiguous whitespace as a single word; such words must be skipped.
204+
// This handles cases such as ' abc' (a space and a word) with the cursor at the beginning
205+
// (before the space): the cursor jumps to the end of 'abc', which is consistent with text
206+
// editors and browsers.
207+
let ci = char_index_from_byte_index(text, word.0);
208+
if ci > index && !skip_word(word.1) {
209+
return ci;
212210
}
213211
}
214-
index
212+
213+
char_index_from_byte_index(text, text.len())
214+
}
215+
216+
fn skip_word(text: &str) -> bool {
217+
// true when `text` consists solely of word characters (alphanumerics and underscore),
218+
// i.e. it contains nothing else (whitespace, dashes, etc.)
219+
!text.chars().any(|c| !is_word_char(c))
215220
}
216221

217222
fn next_line_boundary_char_index(it: impl Iterator<Item = char>, mut index: usize) -> usize {
@@ -233,7 +238,7 @@ fn next_line_boundary_char_index(it: impl Iterator<Item = char>, mut index: usiz
233238
}
234239

235240
pub fn is_word_char(c: char) -> bool {
236-
c.is_ascii_alphanumeric() || c == '_'
241+
c.is_alphanumeric() || c == '_'
237242
}
238243

239244
fn is_linebreak(c: char) -> bool {
@@ -270,6 +275,16 @@ pub fn byte_index_from_char_index(s: &str, char_index: usize) -> usize {
270275
s.len()
271276
}
272277

278+
pub fn char_index_from_byte_index(input: &str, byte_index: usize) -> usize {
279+
for (ci, (bi, _)) in input.char_indices().enumerate() {
280+
if bi == byte_index {
281+
return ci;
282+
}
283+
}
284+
285+
input.char_indices().last().map_or(0, |(i, _)| i + 1)
286+
}
287+
273288
pub fn slice_char_range(s: &str, char_range: std::ops::Range<usize>) -> &str {
274289
assert!(
275290
char_range.start <= char_range.end,
@@ -293,3 +308,38 @@ pub fn cursor_rect(galley: &Galley, cursor: &CCursor, row_height: f32) -> Rect {
293308

294309
cursor_pos
295310
}
311+
312+
#[cfg(test)]
313+
mod test {
314+
use crate::text_selection::text_cursor_state::next_word_boundary_char_index;
315+
316+
#[test]
317+
fn test_next_word_boundary_char_index() {
318+
// ASCII only
319+
let text = "abc d3f g_h i-j";
320+
assert_eq!(next_word_boundary_char_index(text, 1), 3);
321+
assert_eq!(next_word_boundary_char_index(text, 3), 7);
322+
assert_eq!(next_word_boundary_char_index(text, 9), 11);
323+
assert_eq!(next_word_boundary_char_index(text, 12), 13);
324+
assert_eq!(next_word_boundary_char_index(text, 13), 15);
325+
assert_eq!(next_word_boundary_char_index(text, 15), 15);
326+
327+
assert_eq!(next_word_boundary_char_index("", 0), 0);
328+
assert_eq!(next_word_boundary_char_index("", 1), 0);
329+
330+
// Unicode graphemes, some of which consist of multiple Unicode characters.
330+
// !!! A Unicode character is not always what is traditionally considered a character;
331+
// the values below are correct despite not seeming that way at first glance.
332+
// Handling of and around emojis is kind of weird and is not consistent across
333+
// text editors and browsers.
335+
let text = "❤️👍 skvělá knihovna 👍❤️";
336+
assert_eq!(next_word_boundary_char_index(text, 0), 2);
337+
assert_eq!(next_word_boundary_char_index(text, 2), 3); // this does not skip the space between thumbs-up and 'skvělá'
338+
assert_eq!(next_word_boundary_char_index(text, 6), 10);
339+
assert_eq!(next_word_boundary_char_index(text, 9), 10);
340+
assert_eq!(next_word_boundary_char_index(text, 12), 19);
341+
assert_eq!(next_word_boundary_char_index(text, 15), 19);
342+
assert_eq!(next_word_boundary_char_index(text, 19), 20);
343+
assert_eq!(next_word_boundary_char_index(text, 20), 21);
344+
}
345+
}

crates/egui/src/widgets/text_edit/text_buffer.rs

+6-2
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ use epaint::{
88
use crate::{
99
text::CCursorRange,
1010
text_selection::text_cursor_state::{
11-
byte_index_from_char_index, ccursor_next_word, ccursor_previous_word, find_line_start,
12-
slice_char_range,
11+
byte_index_from_char_index, ccursor_next_word, ccursor_previous_word,
12+
char_index_from_byte_index, find_line_start, slice_char_range,
1313
},
1414
};
1515

@@ -48,6 +48,10 @@ pub trait TextBuffer {
4848
byte_index_from_char_index(self.as_str(), char_index)
4949
}
5050

51+
fn char_index_from_byte_index(&self, char_index: usize) -> usize {
52+
char_index_from_byte_index(self.as_str(), char_index)
53+
}
54+
5155
/// Clears all characters in this buffer
5256
fn clear(&mut self) {
5357
self.delete_char_range(0..self.as_str().len());

0 commit comments

Comments
 (0)