Rework TextEdit arrow navigation to handle Unicode graphemes (#5812)

MStarha · lucasmerlin · web-flow · commit 69b9f0eede49 · 2025-04-22T17:44:10.000+02:00
* [x] I have followed the instructions in the PR template

Previously, navigating text in `TextEdit` with Ctrl + left/right arrow
would jump inside words that contained combining characters (e.g.
diacritics). This PR introduces a new dependency on `unicode-segmentation`
to handle grapheme segmentation. The new implementation skips over
whitespace and other separators such as `-` (dash) between words, but
treats `_` (underscore) as part of a word.

---------

Co-authored-by: lucasmerlin <hi@lucasmerlin.me>
diff --git a/Cargo.lock b/Cargo.lock
@@ -1285,6 +1285,7 @@ dependencies = [
  "profiling",
  "ron",
  "serde",
+ "unicode-segmentation",
 ]
 
 [[package]]
diff --git a/Cargo.toml b/Cargo.toml
@@ -99,6 +99,7 @@ serde = { version = "1", features = ["derive"] }
 similar-asserts = "1.4.2"
 thiserror = "1.0.37"
 type-map = "0.5.0"
+unicode-segmentation = "1.12.0"
 wasm-bindgen = "0.2"
 wasm-bindgen-futures = "0.4"
 web-sys = "0.3.73"
diff --git a/crates/egui/Cargo.toml b/crates/egui/Cargo.toml
@@ -87,6 +87,7 @@ ahash.workspace = true
 bitflags.workspace = true
 nohash-hasher.workspace = true
 profiling.workspace = true
+unicode-segmentation.workspace = true
 
 #! ### Optional dependencies
 accesskit = { workspace = true, optional = true }
diff --git a/crates/egui/src/text_selection/text_cursor_state.rs b/crates/egui/src/text_selection/text_cursor_state.rs
@@ -1,6 +1,7 @@
 //! Text cursor changes/interaction, without modifying the text.
 
 use epaint::text::{cursor::CCursor, Galley};
+use unicode_segmentation::UnicodeSegmentation;
 
 use crate::{epaint, NumExt, Rect, Response, Ui};
 
@@ -166,7 +167,7 @@ fn select_line_at(text: &str, ccursor: CCursor) -> CCursorRange {
 
 pub fn ccursor_next_word(text: &str, ccursor: CCursor) -> CCursor {
     CCursor {
-        index: next_word_boundary_char_index(text.chars(), ccursor.index),
+        index: next_word_boundary_char_index(text, ccursor.index),
         prefer_next_row: false,
     }
 }
@@ -180,9 +181,10 @@ fn ccursor_next_line(text: &str, ccursor: CCursor) -> CCursor {
 
 pub fn ccursor_previous_word(text: &str, ccursor: CCursor) -> CCursor {
     let num_chars = text.chars().count();
+    let reversed: String = text.graphemes(true).rev().collect();
     CCursor {
         index: num_chars
-            - next_word_boundary_char_index(text.chars().rev(), num_chars - ccursor.index),
+            - next_word_boundary_char_index(&reversed, num_chars - ccursor.index).min(num_chars),
         prefer_next_row: true,
     }
 }
@@ -196,22 +198,25 @@ fn ccursor_previous_line(text: &str, ccursor: CCursor) -> CCursor {
     }
 }
 
-fn next_word_boundary_char_index(it: impl Iterator<Item = char>, mut index: usize) -> usize {
-    let mut it = it.skip(index);
-    if let Some(_first) = it.next() {
-        index += 1;
-
-        if let Some(second) = it.next() {
-            index += 1;
-            for next in it {
-                if is_word_char(next) != is_word_char(second) {
-                    break;
-                }
-                index += 1;
-            }
+fn next_word_boundary_char_index(text: &str, index: usize) -> usize {
+    for word in text.split_word_bound_indices() {
+        // Splitting considers contiguous whitespace as one word, such words must be skipped,
+        // this handles cases for example ' abc' (a space and a word), the cursor is at the beginning
+        // (before space) - this jumps at the end of 'abc' (this is consistent with text editors
+        // or browsers)
+        let ci = char_index_from_byte_index(text, word.0);
+        if ci > index && !skip_word(word.1) {
+            return ci;
         }
     }
-    index
+
+    char_index_from_byte_index(text, text.len())
+}
+
+/// Returns `true` when `text` consists solely of word characters (see [`is_word_char`]),
+/// i.e. alphanumerics and underscore.
+///
+/// `next_word_boundary_char_index` does not stop at the start of such a segment; it stops at
+/// the start of the next segment that contains a non-word character (whitespace, dashes, etc.),
+/// which is where the preceding word ends.
+fn skip_word(text: &str) -> bool {
+    text.chars().all(is_word_char)
+}
 
 fn next_line_boundary_char_index(it: impl Iterator<Item = char>, mut index: usize) -> usize {
@@ -233,7 +238,7 @@ fn next_line_boundary_char_index(it: impl Iterator<Item = char>, mut index: usiz
 }
 
 pub fn is_word_char(c: char) -> bool {
-    c.is_ascii_alphanumeric() || c == '_'
+    c.is_alphanumeric() || c == '_'
 }
 
 fn is_linebreak(c: char) -> bool {
@@ -270,6 +275,16 @@ pub fn byte_index_from_char_index(s: &str, char_index: usize) -> usize {
     s.len()
 }
 
+/// Converts a byte offset into `input` to the index of the character that starts there.
+///
+/// Returns the number of characters in `input` when `byte_index` is not the start of a
+/// character (for example when it equals `input.len()` or points inside a multi-byte
+/// character).
+pub fn char_index_from_byte_index(input: &str, byte_index: usize) -> usize {
+    input
+        .char_indices()
+        .position(|(bi, _)| bi == byte_index)
+        // The fallback must be a *character* count: the byte offset of the last character
+        // plus one overshoots whenever a multi-byte character precedes the last one.
+        .unwrap_or_else(|| input.chars().count())
+}
+
 pub fn slice_char_range(s: &str, char_range: std::ops::Range<usize>) -> &str {
     assert!(
         char_range.start <= char_range.end,
@@ -293,3 +308,38 @@ pub fn cursor_rect(galley: &Galley, cursor: &CCursor, row_height: f32) -> Rect {
 
     cursor_pos
 }
+
+#[cfg(test)]
+mod test {
+    use crate::text_selection::text_cursor_state::next_word_boundary_char_index;
+
+    #[test]
+    fn test_next_word_boundary_char_index() {
+        // ASCII only
+        let text = "abc d3f g_h i-j";
+        assert_eq!(next_word_boundary_char_index(text, 1), 3);
+        assert_eq!(next_word_boundary_char_index(text, 3), 7);
+        assert_eq!(next_word_boundary_char_index(text, 9), 11);
+        assert_eq!(next_word_boundary_char_index(text, 12), 13);
+        assert_eq!(next_word_boundary_char_index(text, 13), 15);
+        assert_eq!(next_word_boundary_char_index(text, 15), 15);
+
+        assert_eq!(next_word_boundary_char_index("", 0), 0);
+        assert_eq!(next_word_boundary_char_index("", 1), 0);
+
+        // Unicode graphemes, some of which consist of multiple Unicode characters,
+        // !!! Unicode character is not always what is traditionally considered a character,
+        // the values below are correct despite not seeming that way on the first look,
+        // handling of and around emojis is kind of weird and is not consistent across
+        // text editors and browsers
+        let text = "❤️👍 skvělá knihovna 👍❤️";
+        assert_eq!(next_word_boundary_char_index(text, 0), 2);
+        assert_eq!(next_word_boundary_char_index(text, 2), 3); // this does not skip the space between thumbs-up and 'skvělá'
+        assert_eq!(next_word_boundary_char_index(text, 6), 10);
+        assert_eq!(next_word_boundary_char_index(text, 9), 10);
+        assert_eq!(next_word_boundary_char_index(text, 12), 19);
+        assert_eq!(next_word_boundary_char_index(text, 15), 19);
+        assert_eq!(next_word_boundary_char_index(text, 19), 20);
+        assert_eq!(next_word_boundary_char_index(text, 20), 21);
+    }
+}
diff --git a/crates/egui/src/widgets/text_edit/text_buffer.rs b/crates/egui/src/widgets/text_edit/text_buffer.rs
@@ -8,8 +8,8 @@ use epaint::{
 use crate::{
     text::CCursorRange,
     text_selection::text_cursor_state::{
-        byte_index_from_char_index, ccursor_next_word, ccursor_previous_word, find_line_start,
-        slice_char_range,
+        byte_index_from_char_index, ccursor_next_word, ccursor_previous_word,
+        char_index_from_byte_index, find_line_start, slice_char_range,
     },
 };
 
@@ -48,6 +48,10 @@ pub trait TextBuffer {
         byte_index_from_char_index(self.as_str(), char_index)
     }
 
+    /// Converts a byte index into this buffer's text to the corresponding character index
+    /// by delegating to the free function `char_index_from_byte_index` on `self.as_str()`.
+    fn char_index_from_byte_index(&self, byte_index: usize) -> usize {
+        char_index_from_byte_index(self.as_str(), byte_index)
+    }
+
     /// Clears all characters in this buffer
     fn clear(&mut self) {
         self.delete_char_range(0..self.as_str().len());

Original file line number	Diff line number	Diff line change
`@@ -1285,6 +1285,7 @@ dependencies = [`
`1285`	`1285`	`"profiling",`
`1286`	`1286`	`"ron",`
`1287`	`1287`	`"serde",`
	`1288`	`+ "unicode-segmentation",`
`1288`	`1289`	`]`
`1289`	`1290`
`1290`	`1291`	`[[package]]`