Skip to content

Rework TextEdit arrow navigation to handle Unicode graphemes #5812

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1285,6 +1285,7 @@ dependencies = [
"profiling",
"ron",
"serde",
"unicode-segmentation",
]

[[package]]
Expand Down
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ serde = { version = "1", features = ["derive"] }
similar-asserts = "1.4.2"
thiserror = "1.0.37"
type-map = "0.5.0"
unicode-segmentation = "1.12.0"
wasm-bindgen = "0.2"
wasm-bindgen-futures = "0.4"
web-sys = "0.3.73"
Expand Down
1 change: 1 addition & 0 deletions crates/egui/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ ahash.workspace = true
bitflags.workspace = true
nohash-hasher.workspace = true
profiling.workspace = true
unicode-segmentation.workspace = true

#! ### Optional dependencies
accesskit = { workspace = true, optional = true }
Expand Down
84 changes: 67 additions & 17 deletions crates/egui/src/text_selection/text_cursor_state.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
//! Text cursor changes/interaction, without modifying the text.

use epaint::text::{cursor::CCursor, Galley};
use unicode_segmentation::UnicodeSegmentation;

use crate::{epaint, NumExt, Rect, Response, Ui};

Expand Down Expand Up @@ -166,7 +167,7 @@ fn select_line_at(text: &str, ccursor: CCursor) -> CCursorRange {

pub fn ccursor_next_word(text: &str, ccursor: CCursor) -> CCursor {
CCursor {
index: next_word_boundary_char_index(text.chars(), ccursor.index),
index: next_word_boundary_char_index(text, ccursor.index),
prefer_next_row: false,
}
}
Expand All @@ -180,9 +181,10 @@ fn ccursor_next_line(text: &str, ccursor: CCursor) -> CCursor {

pub fn ccursor_previous_word(text: &str, ccursor: CCursor) -> CCursor {
let num_chars = text.chars().count();
let reversed: String = text.graphemes(true).rev().collect();
CCursor {
index: num_chars
- next_word_boundary_char_index(text.chars().rev(), num_chars - ccursor.index),
- next_word_boundary_char_index(&reversed, num_chars - ccursor.index).min(num_chars),
prefer_next_row: true,
}
}
Expand All @@ -196,22 +198,25 @@ fn ccursor_previous_line(text: &str, ccursor: CCursor) -> CCursor {
}
}

fn next_word_boundary_char_index(it: impl Iterator<Item = char>, mut index: usize) -> usize {
let mut it = it.skip(index);
if let Some(_first) = it.next() {
index += 1;

if let Some(second) = it.next() {
index += 1;
for next in it {
if is_word_char(next) != is_word_char(second) {
break;
}
index += 1;
}
fn next_word_boundary_char_index(text: &str, index: usize) -> usize {
for word in text.split_word_bound_indices() {
// Splitting treats contiguous whitespace as a single word; such words must be skipped.
// For example, with ' abc' (a space followed by a word) and the cursor at the very
// beginning (before the space), this jumps to the end of 'abc', which is consistent
// with text editors and browsers.
let ci = char_index_from_byte_index(text, word.0);
if ci > index && !skip_word(word.1) {
return ci;
}
}
index

char_index_from_byte_index(text, text.len())
}

fn skip_word(text: &str) -> bool {
// skip words that contain anything other than alphanumeric characters and underscore
// (i.e. whitespace, dashes, etc.)
!text.chars().any(|c| !is_word_char(c))
}

fn next_line_boundary_char_index(it: impl Iterator<Item = char>, mut index: usize) -> usize {
Expand All @@ -233,7 +238,7 @@ fn next_line_boundary_char_index(it: impl Iterator<Item = char>, mut index: usiz
}

pub fn is_word_char(c: char) -> bool {
c.is_ascii_alphanumeric() || c == '_'
c.is_alphanumeric() || c == '_'
}

fn is_linebreak(c: char) -> bool {
Expand Down Expand Up @@ -270,6 +275,16 @@ pub fn byte_index_from_char_index(s: &str, char_index: usize) -> usize {
s.len()
}

/// Converts a byte offset into `input` to the corresponding `char` index.
///
/// Returns the number of `char`s that start before `byte_index`:
/// * for an offset on a char boundary, this is the index of the char starting there;
/// * for an offset at or past the end of `input`, this is the total number of chars
///   (`0` for an empty string);
/// * for an offset inside a multi-byte char, this is the index just after that char.
pub fn char_index_from_byte_index(input: &str, byte_index: usize) -> usize {
    // Count char starts instead of reusing the last char's *byte* offset, so the
    // end-of-string result stays in char units even when `input` contains multi-byte chars.
    input
        .char_indices()
        .take_while(|&(start, _)| start < byte_index)
        .count()
}

pub fn slice_char_range(s: &str, char_range: std::ops::Range<usize>) -> &str {
assert!(
char_range.start <= char_range.end,
Expand All @@ -293,3 +308,38 @@ pub fn cursor_rect(galley: &Galley, cursor: &CCursor, row_height: f32) -> Rect {

cursor_pos
}

#[cfg(test)]
mod test {
use crate::text_selection::text_cursor_state::next_word_boundary_char_index;

// Checks `next_word_boundary_char_index` against hand-computed expectations for
// ASCII text, the empty string, and text containing emoji and accented letters.
// All positions (arguments and expected results) are `char` indices, not byte offsets.
#[test]
fn test_next_word_boundary_char_index() {
// ASCII only
let text = "abc d3f g_h i-j";
assert_eq!(next_word_boundary_char_index(text, 1), 3);
assert_eq!(next_word_boundary_char_index(text, 3), 7);
assert_eq!(next_word_boundary_char_index(text, 9), 11);
assert_eq!(next_word_boundary_char_index(text, 12), 13);
assert_eq!(next_word_boundary_char_index(text, 13), 15);
assert_eq!(next_word_boundary_char_index(text, 15), 15);

// Empty text: the result is 0 even when the given index is past the end.
assert_eq!(next_word_boundary_char_index("", 0), 0);
assert_eq!(next_word_boundary_char_index("", 1), 0);

// Unicode graphemes, some of which consist of multiple Unicode characters.
// NOTE: a Unicode character (`char`) is not always what is traditionally considered
// a character, so the values below are correct even if they do not look that way at
// first glance (presumably the heart emoji spans two `char`s here, base + variation
// selector - confirm against the literal's actual code points).
// Handling of and around emojis is somewhat odd and is not consistent across
// text editors and browsers.
let text = "❤️👍 skvělá knihovna 👍❤️";
assert_eq!(next_word_boundary_char_index(text, 0), 2);
assert_eq!(next_word_boundary_char_index(text, 2), 3); // this does not skip the space between thumbs-up and 'skvělá'
assert_eq!(next_word_boundary_char_index(text, 6), 10);
assert_eq!(next_word_boundary_char_index(text, 9), 10);
assert_eq!(next_word_boundary_char_index(text, 12), 19);
assert_eq!(next_word_boundary_char_index(text, 15), 19);
assert_eq!(next_word_boundary_char_index(text, 19), 20);
assert_eq!(next_word_boundary_char_index(text, 20), 21);
}
}
8 changes: 6 additions & 2 deletions crates/egui/src/widgets/text_edit/text_buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ use epaint::{
use crate::{
text::CCursorRange,
text_selection::text_cursor_state::{
byte_index_from_char_index, ccursor_next_word, ccursor_previous_word, find_line_start,
slice_char_range,
byte_index_from_char_index, ccursor_next_word, ccursor_previous_word,
char_index_from_byte_index, find_line_start, slice_char_range,
},
};

Expand Down Expand Up @@ -48,6 +48,10 @@ pub trait TextBuffer {
byte_index_from_char_index(self.as_str(), char_index)
}

/// Converts a byte offset into this buffer's text to a `char` index.
///
/// Delegates to the free function `char_index_from_byte_index`, passing `self.as_str()`.
/// The argument is a *byte* index; the returned value is a *char* index.
fn char_index_from_byte_index(&self, byte_index: usize) -> usize {
    char_index_from_byte_index(self.as_str(), byte_index)
}

/// Clears all characters in this buffer
fn clear(&mut self) {
self.delete_char_range(0..self.as_str().len());
Expand Down
Loading