Skip to content

Commit c2603e2

Browse files
author
bors-servo
authored
Auto merge of #36 - behnam:runseq, r=mbrubeck
Improve basic conformance and add more unit tests Continue my review of code based on spec and fix various steps that drop the basic conformance failures from 12827 to 250. * X10: Fix incorrect index for `succ_level` resolution (off by `end_of_seq` because of sub-sequencing) and add unit tests from the spec text, from both BD13 and X10 sections. * N1/N2: Fix resetting rule for NI levels by definint `e`, as noted in the spec. * L1: Fix not resetting leading whitespace/formatting chars in a line. * Use non-ASCII sample chars for `gen_char_from_bidi_class()`, which increases the failure rate from 250 to 314. * This allows easier testing of the logic, specially having specific cases from the conformance test as unit test, as corner bugs unfold. * There's almost no extra cost in expanding the shortcut condition to check all the isolating run sequences to be LTR, if there's more than one. This can prevent a bunch of string manipulation when all isolating run sequences are LTR. <!-- Reviewable:start --> --- This change is [<img src="https://reviewable.io/review_button.svg" height="34" align="absmiddle" alt="Reviewable"/>](https://reviewable.io/reviews/servo/unicode-bidi/36) <!-- Reviewable:end -->
2 parents 36c0603 + bacbb67 commit c2603e2

File tree

7 files changed

+352
-109
lines changed

7 files changed

+352
-109
lines changed

benches/basic.rs

+3-8
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,10 @@ use test::Bencher;
1818
use unicode_bidi::BidiInfo;
1919

2020

21-
const LTR_TEXTS: &[&str] = &[
22-
"abc\ndef\nghi",
23-
"abc 123\ndef 456\nghi 789",
24-
];
21+
const LTR_TEXTS: &[&str] = &["abc\ndef\nghi", "abc 123\ndef 456\nghi 789"];
2522

26-
const BIDI_TEXTS: &[&str] = &[
27-
"ابجد\nهوز\nحتی",
28-
"ابجد ۱۲۳\nهوز ۴۵۶\nحتی ۷۸۹",
29-
];
23+
const BIDI_TEXTS: &[&str] =
24+
&["ابجد\nهوز\nحتی", "ابجد ۱۲۳\nهوز ۴۵۶\nحتی ۷۸۹"];
3025

3126

3227
fn bench_bidi_info_new(b: &mut Bencher, texts: &[&str]) {

src/explicit.rs

+6-6
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,11 @@ use BidiClass::*;
2323
pub fn compute(
2424
text: &str,
2525
para_level: Level,
26-
initial_classes: &[BidiClass],
26+
original_classes: &[BidiClass],
2727
levels: &mut [Level],
2828
processing_classes: &mut [BidiClass],
2929
) {
30-
assert!(text.len() == initial_classes.len());
30+
assert!(text.len() == original_classes.len());
3131

3232
// http://www.unicode.org/reports/tr9/#X1
3333
let mut stack = DirectionalStatusStack::new();
@@ -38,14 +38,14 @@ pub fn compute(
3838
let mut valid_isolate_count = 0u32;
3939

4040
for (i, c) in text.char_indices() {
41-
match initial_classes[i] {
41+
match original_classes[i] {
4242

4343
// Rules X2-X5c
4444
RLE | LRE | RLO | LRO | RLI | LRI | FSI => {
4545
let last_level = stack.last().level;
4646

4747
// X5a-X5c: Isolate initiators get the level of the last entry on the stack.
48-
let is_isolate = matches!(initial_classes[i], RLI | LRI | FSI);
48+
let is_isolate = matches!(original_classes[i], RLI | LRI | FSI);
4949
if is_isolate {
5050
levels[i] = last_level;
5151
match stack.last().status {
@@ -55,7 +55,7 @@ pub fn compute(
5555
}
5656
}
5757

58-
let new_level = if is_rtl(initial_classes[i]) {
58+
let new_level = if is_rtl(original_classes[i]) {
5959
last_level.new_explicit_next_rtl()
6060
} else {
6161
last_level.new_explicit_next_ltr()
@@ -64,7 +64,7 @@ pub fn compute(
6464
overflow_embedding_count == 0 {
6565
let new_level = new_level.unwrap();
6666
stack.push(
67-
new_level, match initial_classes[i] {
67+
new_level, match original_classes[i] {
6868
RLO => OverrideStatus::RTL,
6969
LRO => OverrideStatus::LTR,
7070
RLI | LRI | FSI => OverrideStatus::Isolate,

src/format_chars.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ pub const PDI: char = '\u{2069}';
3434
pub const LRE: char = '\u{202A}';
3535
/// RIGHT-TO-LEFT EMBEDDING
3636
pub const RLE: char = '\u{202B}';
37+
/// POP DIRECTIONAL FORMATTING
38+
pub const PDF: char = '\u{202C}';
3739
/// LEFT-TO-RIGHT OVERRIDE
3840
pub const LRO: char = '\u{202D}';
3941
/// RIGHT-TO-LEFT OVERRIDE
4042
pub const RLO: char = '\u{202E}';
41-
/// POP DIRECTIONAL FORMATTING
42-
pub const PDF: char = '\u{202C}';

src/implicit.rs

+18-9
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, processing_classes: &mut [B
2626
// by an "earlier" rule. We should either split this into separate passes, or preserve
2727
// extra state so each rule can see the correct previous class.
2828

29+
// FIXME: Also, this could be the cause of increased failure for using longer-UTF-8 chars in
30+
// conformance tests, like BidiTest:69635 (AL ET EN)
31+
2932
let mut prev_class = sequence.sos;
3033
let mut last_strong_is_al = false;
3134
let mut et_run_indices = Vec::new(); // for W5
@@ -139,22 +142,17 @@ pub fn resolve_neutral(
139142
levels: &[Level],
140143
processing_classes: &mut [BidiClass],
141144
) {
145+
let e: BidiClass = levels[sequence.runs[0].start].bidi_class();
142146
let mut indices = sequence.runs.iter().flat_map(Clone::clone);
143147
let mut prev_class = sequence.sos;
144148

145-
// Neutral or Isolate formatting characters (NI).
146-
// http://www.unicode.org/reports/tr9/#NI
147-
fn ni(class: BidiClass) -> bool {
148-
matches!(class, B | S | WS | ON | FSI | LRI | RLI | PDI)
149-
}
150-
151149
while let Some(mut i) = indices.next() {
152150
// N0. Process bracket pairs.
153151
// TODO
154152

155153
// Process sequences of NI characters.
156154
let mut ni_run = Vec::new();
157-
if ni(processing_classes[i]) {
155+
if is_NI(processing_classes[i]) {
158156
// Consume a run of consecutive NI characters.
159157
ni_run.push(i);
160158
let mut next_class;
@@ -166,7 +164,7 @@ pub fn resolve_neutral(
166164
continue;
167165
}
168166
next_class = processing_classes[j];
169-
if ni(next_class) {
167+
if is_NI(next_class) {
170168
ni_run.push(i);
171169
} else {
172170
break;
@@ -180,11 +178,14 @@ pub fn resolve_neutral(
180178
}
181179

182180
// N1-N2.
181+
//
182+
// http://www.unicode.org/reports/tr9/#N1
183+
// http://www.unicode.org/reports/tr9/#N2
183184
let new_class = match (prev_class, next_class) {
184185
(L, L) => L,
185186
(R, R) | (R, AN) | (R, EN) | (AN, R) | (AN, AN) | (AN, EN) | (EN, R) |
186187
(EN, AN) | (EN, EN) => R,
187-
(_, _) => levels[i].bidi_class(),
188+
(_, _) => e,
188189
};
189190
for j in &ni_run {
190191
processing_classes[*j] = new_class;
@@ -218,3 +219,11 @@ pub fn resolve_levels(original_classes: &[BidiClass], levels: &mut [Level]) -> L
218219

219220
max_level
220221
}
222+
223+
/// Neutral or Isolate formatting character (B, S, WS, ON, FSI, LRI, RLI, PDI)
224+
///
225+
/// http://www.unicode.org/reports/tr9/#NI
226+
#[allow(non_snake_case)]
227+
fn is_NI(class: BidiClass) -> bool {
228+
matches!(class, B | S | WS | ON | FSI | LRI | RLI | PDI)
229+
}

src/lib.rs

+91-3
Original file line numberDiff line numberDiff line change
@@ -291,16 +291,31 @@ impl<'text> BidiInfo<'text> {
291291
}
292292
}
293293

294-
/// Re-order a line based on resolved levels and return only the embedding levels.
294+
/// Re-order a line based on resolved levels and return only the embedding levels, one `Level`
295+
/// per *byte*.
295296
pub fn reordered_levels(&self, para: &ParagraphInfo, line: Range<usize>) -> Vec<Level> {
296297
let (levels, _) = self.visual_runs(para, line.clone());
297298
levels
298299
}
299300

301+
/// Re-order a line based on resolved levels and return only the embedding levels, one `Level`
302+
/// per *character*.
303+
pub fn reordered_levels_per_char(
304+
&self,
305+
para: &ParagraphInfo,
306+
line: Range<usize>,
307+
) -> Vec<Level> {
308+
let levels = self.reordered_levels(para, line);
309+
self.text.char_indices().map(|(i, _)| levels[i]).collect()
310+
}
311+
312+
300313
/// Re-order a line based on resolved levels and return the line in display order.
301314
pub fn reorder_line(&self, para: &ParagraphInfo, line: Range<usize>) -> Cow<'text, str> {
302315
let (levels, runs) = self.visual_runs(para, line.clone());
303-
if runs.len() == 1 && levels[runs[0].start].is_ltr() {
316+
317+
// If all isolating run sequences are LTR, no reordering is needed
318+
if runs.iter().all(|run| levels[run.start].is_ltr()) {
304319
return self.text[line.clone()].into();
305320
}
306321

@@ -333,10 +348,12 @@ impl<'text> BidiInfo<'text> {
333348
// Reset some whitespace chars to paragraph level.
334349
// http://www.unicode.org/reports/tr9/#L1
335350
let line_str: &str = &self.text[line.clone()];
336-
let mut reset_from: Option<usize> = None;
351+
let mut reset_from: Option<usize> = Some(0);
337352
let mut reset_to: Option<usize> = None;
338353
for (i, c) in line_str.char_indices() {
339354
match self.original_classes[i] {
355+
// Ignored by X9
356+
RLE | LRE | RLO | LRO | PDF | BN => {}
340357
// Segment separator, Paragraph separator
341358
B | S => {
342359
assert!(reset_to == None);
@@ -635,6 +652,10 @@ mod tests {
635652
],
636653
}
637654
);
655+
656+
/// BidiTest:69635 (AL ET EN)
657+
let bidi_info = BidiInfo::new("\u{060B}\u{20CF}\u{06F9}", None);
658+
assert_eq!(bidi_info.original_classes, vec![AL, AL, ET, ET, ET, EN, EN]);
638659
}
639660

640661
#[test]
@@ -703,6 +724,18 @@ mod tests {
703724
// Numbers being weak LTR characters, cannot reorder strong RTL
704725
assert_eq!(reorder_paras("123 אבג"), vec!["גבא 123"]);
705726

727+
assert_eq!(reorder_paras("abc\u{202A}def"), vec!["abc\u{202A}def"]);
728+
729+
assert_eq!(
730+
reorder_paras("abc\u{202A}def\u{202C}ghi"),
731+
vec!["abc\u{202A}def\u{202C}ghi"]
732+
);
733+
734+
assert_eq!(
735+
reorder_paras("abc\u{2066}def\u{2069}ghi"),
736+
vec!["abc\u{2066}def\u{2069}ghi"]
737+
);
738+
706739
// Testing for RLE Character
707740
assert_eq!(
708741
reorder_paras("\u{202B}abc אבג\u{202C}"),
@@ -725,6 +758,7 @@ mod tests {
725758
reorder_paras("abc\u{2067}.-\u{2069}ghi"),
726759
vec!["abc\u{2067}-.\u{2069}ghi"]
727760
);
761+
728762
assert_eq!(
729763
reorder_paras("Hello, \u{2068}\u{202E}world\u{202C}\u{2069}!"),
730764
vec!["Hello, \u{2068}\u{202E}\u{202C}dlrow\u{2069}!"]
@@ -739,6 +773,60 @@ mod tests {
739773
vec!["ef].)gh&[דג(בא"]
740774
);
741775
}
776+
777+
fn reordered_levels_for_paras(text: &str) -> Vec<Vec<Level>> {
778+
let bidi_info = BidiInfo::new(text, None);
779+
bidi_info
780+
.paragraphs
781+
.iter()
782+
.map(|para| bidi_info.reordered_levels(para, para.range.clone()))
783+
.collect()
784+
}
785+
786+
fn reordered_levels_per_char_for_paras(text: &str) -> Vec<Vec<Level>> {
787+
let bidi_info = BidiInfo::new(text, None);
788+
bidi_info
789+
.paragraphs
790+
.iter()
791+
.map(|para| bidi_info.reordered_levels_per_char(para, para.range.clone()))
792+
.collect()
793+
}
794+
795+
#[test]
796+
fn test_reordered_levels() {
797+
798+
/// BidiTest:946 (LRI PDI)
799+
let text = "\u{2067}\u{2069}";
800+
assert_eq!(
801+
reordered_levels_for_paras(text),
802+
vec![Level::vec(&[0, 0, 0, 0, 0, 0])]
803+
);
804+
assert_eq!(
805+
reordered_levels_per_char_for_paras(text),
806+
vec![Level::vec(&[0, 0])]
807+
);
808+
809+
/* TODO
810+
/// BidiTest:69635 (AL ET EN)
811+
let text = "\u{060B}\u{20CF}\u{06F9}";
812+
assert_eq!(
813+
reordered_levels_for_paras(text),
814+
vec![Level::vec(&[1, 1, 1, 1, 1, 2, 2])]
815+
);
816+
assert_eq!(
817+
reordered_levels_per_char_for_paras(text),
818+
vec![Level::vec(&[1, 1, 2])]
819+
);
820+
*/
821+
822+
/* TODO
823+
// BidiTest:291284 (AN RLI PDF R)
824+
assert_eq!(
825+
reordered_levels_per_char_for_paras("\u{0605}\u{2067}\u{202C}\u{0590}"),
826+
vec![&["2", "0", "x", "1"]]
827+
);
828+
*/
829+
}
742830
}
743831

744832

0 commit comments

Comments
 (0)