Skip to content

Commit 326f159

Browse files
authored
Improve text spanner (#91)
1 parent d4dedf7 commit 326f159

12 files changed

+1486
-497
lines changed

crates/krilla/src/chunk_container.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ impl ChunkContainer {
173173
// TODO: Replace with `is_none_or` once MSRV allows to.
174174
let missing_title = match self.metadata.as_ref() {
175175
None => true,
176-
Some(m) => m.title.is_none()
176+
Some(m) => m.title.is_none(),
177177
};
178178

179179
if missing_title {

crates/krilla/src/content.rs

+51-27
Original file line numberDiff line numberDiff line change
@@ -464,7 +464,15 @@ impl ContentBuilder {
464464

465465
// Separate into distinct glyph runs that either are encoded using actual text, or are
466466
// not.
467-
let spanned = TextSpanner::new(glyphs, text, paint_mode, font_container.clone());
467+
let spanned = TextSpanner::new(
468+
glyphs,
469+
text,
470+
sc.serialize_settings()
471+
.validator
472+
.requires_codepoint_mappings(),
473+
paint_mode,
474+
font_container.clone(),
475+
);
468476

469477
for fragment in spanned {
470478
if let Some(text) = fragment.actual_text() {
@@ -1094,6 +1102,7 @@ where
10941102
{
10951103
slice: &'a [T],
10961104
paint_mode: PaintMode<'a>,
1105+
forbid_invalid_codepoints: bool,
10971106
font_container: Rc<RefCell<FontContainer>>,
10981107
text: &'a str,
10991108
}
@@ -1105,12 +1114,14 @@ where
11051114
pub(crate) fn new(
11061115
slice: &'a [T],
11071116
text: &'a str,
1117+
forbid_invalid_codepoints: bool,
11081118
paint_mode: PaintMode<'a>,
11091119
font_container: Rc<RefCell<FontContainer>>,
11101120
) -> Self {
11111121
Self {
11121122
slice,
11131123
paint_mode,
1124+
forbid_invalid_codepoints,
11141125
text,
11151126
font_container,
11161127
}
@@ -1128,6 +1139,8 @@ where
11281139
fn func<U>(
11291140
g: &U,
11301141
paint_mode: PaintMode,
1142+
previous_range: Option<Range<usize>>,
1143+
forbid_invalid_codepoints: bool,
11311144
mut font_container: RefMut<FontContainer>,
11321145
text: &str,
11331146
) -> (Range<usize>, bool)
@@ -1145,10 +1158,25 @@ where
11451158
let codepoints = pdf_font.get_codepoints(pdf_glyph);
11461159
// Check if the glyph has already been assigned codepoints that don't match the
11471160
// one we are seeing right now.
1148-
let incompatible_codepoint = codepoints.is_some() && codepoints != Some(text);
1149-
1150-
// Only set the codepoint if there isn't a previous one.
1151-
if !incompatible_codepoint {
1161+
let incompatible_codepoint = codepoints.is_some_and(|existing| existing != text);
1162+
1163+
// Only set the codepoint if there isn't a previous, different mapping.
1164+
//
1165+
// If we could set it, we only want to insert a codepoint if we are not already
1166+
// building a spanned run (which is the case if the previous range is the same).
1167+
// If we are building a spanned run, it means that the glyphs are part of the same
1168+
// cluster, in which case only the first glyph should be assigned the codepoint,
1169+
// while all other glyphs in the same cluster should not be assigned anything.
1170+
// Otherwise, when copying text from the PDF, we will get the same codepoint multiple
1171+
// times in viewers that don't support `ActualText`.
1172+
//
1173+
// However, in case we are for example exporting to PDF/UA, every glyph is required
1174+
// to have a valid codepoint mapping. So in this case, we still add the codepoints
1175+
// to each glyph in the cluster, even though this will result in worse copy-pasting in viewers
1176+
// that don't support `ActualText`.
1177+
if !incompatible_codepoint
1178+
&& (previous_range != Some(range.clone()) || forbid_invalid_codepoints)
1179+
{
11521180
pdf_font.set_codepoints(pdf_glyph, text.to_string());
11531181
}
11541182

@@ -1165,6 +1193,8 @@ where
11651193
let (first_range, first_incompatible) = func(
11661194
iter.next()?,
11671195
self.paint_mode,
1196+
None,
1197+
self.forbid_invalid_codepoints,
11681198
self.font_container.borrow_mut(),
11691199
self.text,
11701200
);
@@ -1175,6 +1205,8 @@ where
11751205
let (next_range, next_incompatible) = func(
11761206
next,
11771207
self.paint_mode,
1208+
Some(last_range.clone()),
1209+
self.forbid_invalid_codepoints,
11781210
self.font_container.borrow_mut(),
11791211
self.text,
11801212
);
@@ -1183,33 +1215,24 @@ where
11831215
// In this case, we just started and we are looking at the first two glyphs.
11841216
// This decides whether the current run will be spanned, or not.
11851217
None => {
1186-
// The first glyph is incompatible, so we definitely need actual text.
1187-
if first_incompatible {
1218+
// The two glyphs are in the same range, so we definitely want this run
1219+
// to be spanned, and also want to include both glyphs in that run.
1220+
if last_range == next_range {
11881221
use_span = Some(true);
1189-
1190-
// If the range of the next one is the same, it means they are
1191-
// part of the same cluster, meaning that we need to include it
1192-
// in the actual text. If not, we abort and only wrap the first
1193-
// glyph in actual text.
1194-
if last_range != next_range {
1222+
} else {
1223+
// Else, whether we use a span depends on whether the first glyph
1224+
// is incompatible.
1225+
use_span = Some(first_incompatible);
1226+
1227+
// If either the first glyph or the second glyph are incompatible, they
1228+
// need to be in separate runs, since they are not part of the same cluster.
1229+
if first_incompatible || next_incompatible {
11951230
break;
11961231
}
1197-
}
11981232

1199-
// If the next is incompatible but not part of the current cluster,
1200-
// then it will need a dedicated spanned range, and
1201-
// we can't include it in the current text span. So we abort and
1202-
// create a spanned element with just the first glyph.
1203-
if next_incompatible && last_range != next_range {
1204-
break;
1233+
// If none are incompatible, then `use_span` is false, and we can also
1234+
// include the next glyph in that unspanned run.
12051235
}
1206-
1207-
// If they have the same range, they are part of the same cluster,
1208-
// and thus we started a spanned range with actual text.
1209-
//
1210-
// Otherwise, they are part of a different cluster, and we
1211-
// start a spanned range with no actual text (common case).
1212-
use_span = Some(last_range == next_range);
12131236
}
12141237
// We are currently building a spanned range, and all glyphs
12151238
// are part of the same cluster.
@@ -1251,6 +1274,7 @@ where
12511274
true => TextSpan::Spanned(head, &self.text[first_range]),
12521275
false => TextSpan::Unspanned(head),
12531276
};
1277+
12541278
Some(fragment)
12551279
}
12561280
}

crates/krilla/src/surface.rs

+16-11
Original file line numberDiff line numberDiff line change
@@ -647,6 +647,7 @@ fn naive_shape(
647647
mod tests {
648648
use crate::font::Font;
649649
use crate::mask::MaskType;
650+
use crate::page::Page;
650651
use crate::paint::{LinearGradient, Paint, SpreadMethod};
651652
use crate::path::Fill;
652653
use crate::surface::Surface;
@@ -811,8 +812,9 @@ mod tests {
811812
);
812813
}
813814

814-
#[snapshot(stream)]
815-
fn stream_complex_text(surface: &mut Surface) {
815+
#[snapshot(single_page)]
816+
fn complex_text(page: &mut Page) {
817+
let mut surface = page.surface();
816818
surface.fill_text(
817819
Point::from_xy(0.0, 50.0),
818820
Fill::default(),
@@ -825,41 +827,44 @@ mod tests {
825827
);
826828
}
827829

828-
#[snapshot(stream)]
829-
fn stream_complex_text_2(surface: &mut Surface) {
830+
#[snapshot(single_page)]
831+
fn complex_text_2(page: &mut Page) {
832+
let mut surface = page.surface();
830833
surface.fill_text(
831834
Point::from_xy(0.0, 50.0),
832835
Fill::default(),
833836
Font::new(NOTO_SANS_DEVANAGARI.clone(), 0, true).unwrap(),
834837
16.0,
835838
&[],
836-
"यु॒धा नर॑ ऋ॒ष्वा ",
839+
"यु॒धा नर॑ ऋ॒ष्वा",
837840
false,
838841
TextDirection::Auto,
839842
);
840843
}
841844

842-
#[snapshot(stream)]
843-
fn stream_complex_text_3(surface: &mut Surface) {
845+
#[snapshot(single_page)]
846+
fn complex_text_3(page: &mut Page) {
847+
let mut surface = page.surface();
844848
surface.fill_text(
845849
Point::from_xy(0.0, 50.0),
846850
Fill::default(),
847851
Font::new(NOTO_SANS_DEVANAGARI.clone(), 0, true).unwrap(),
848-
16.0,
852+
12.0,
849853
&[],
850854
"आ रु॒क्मैरा यु॒धा नर॑ ऋ॒ष्वा ऋ॒ष्टीर॑सृक्षत ।",
851855
false,
852856
TextDirection::Auto,
853857
);
854858
}
855859

856-
#[snapshot(stream)]
857-
fn stream_complex_text_4(surface: &mut Surface) {
860+
#[snapshot(single_page)]
861+
fn complex_text_4(page: &mut Page) {
862+
let mut surface = page.surface();
858863
surface.fill_text(
859864
Point::from_xy(0.0, 50.0),
860865
Fill::default(),
861866
Font::new(NOTO_SANS_DEVANAGARI.clone(), 0, true).unwrap(),
862-
16.0,
867+
10.0,
863868
&[],
864869
"अन्वे॑नाँ॒ अह॑ वि॒द्युतो॑ म॒रुतो॒ जज्झ॑तीरव भनर॑र्त॒ त्मना॑ दि॒वः ॥",
865870
false,

crates/krilla/src/validation/mod.rs

+22-10
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,9 @@ impl Validator {
255255
ValidationError::ContainsPostScript => true,
256256
ValidationError::MissingCMYKProfile => true,
257257
ValidationError::ContainsNotDefGlyph => false,
258-
ValidationError::InvalidCodepointMapping(_, _) => false,
258+
ValidationError::InvalidCodepointMapping(_, _) => {
259+
self.requires_codepoint_mappings()
260+
}
259261
ValidationError::UnicodePrivateArea(_, _) => false,
260262
ValidationError::NoDocumentLanguage => *self == Validator::A1_A,
261263
ValidationError::NoDocumentTitle => false,
@@ -276,11 +278,10 @@ impl Validator {
276278
ValidationError::ContainsPostScript => true,
277279
ValidationError::MissingCMYKProfile => true,
278280
ValidationError::ContainsNotDefGlyph => true,
279-
// Only applies for PDF/A2-U and PDF/A2-A
280-
ValidationError::InvalidCodepointMapping(_, _) => *self != Validator::A2_B,
281-
// Only applies to PDF/A2-A
281+
ValidationError::InvalidCodepointMapping(_, _) => {
282+
self.requires_codepoint_mappings()
283+
}
282284
ValidationError::UnicodePrivateArea(_, _) => *self == Validator::A2_A,
283-
// Only applies to PDF/A2-A
284285
ValidationError::NoDocumentLanguage => *self == Validator::A2_A,
285286
ValidationError::NoDocumentTitle => false,
286287
ValidationError::MissingAltText => false,
@@ -300,11 +301,10 @@ impl Validator {
300301
ValidationError::ContainsPostScript => true,
301302
ValidationError::MissingCMYKProfile => true,
302303
ValidationError::ContainsNotDefGlyph => true,
303-
// Only applies for PDF/A3-U and PDF/A3-A
304-
ValidationError::InvalidCodepointMapping(_, _) => *self != Validator::A3_B,
305-
// Only applies to PDF/A3-A
304+
ValidationError::InvalidCodepointMapping(_, _) => {
305+
self.requires_codepoint_mappings()
306+
}
306307
ValidationError::UnicodePrivateArea(_, _) => *self == Validator::A3_A,
307-
// Only applies to PDF/A3-A
308308
ValidationError::NoDocumentLanguage => *self == Validator::A3_A,
309309
ValidationError::NoDocumentTitle => false,
310310
ValidationError::MissingAltText => false,
@@ -324,7 +324,9 @@ impl Validator {
324324
ValidationError::ContainsPostScript => false,
325325
ValidationError::MissingCMYKProfile => false,
326326
ValidationError::ContainsNotDefGlyph => true,
327-
ValidationError::InvalidCodepointMapping(_, _) => true,
327+
ValidationError::InvalidCodepointMapping(_, _) => {
328+
self.requires_codepoint_mappings()
329+
}
328330
ValidationError::UnicodePrivateArea(_, _) => false,
329331
ValidationError::NoDocumentLanguage => false,
330332
ValidationError::NoDocumentTitle => true,
@@ -424,6 +426,16 @@ impl Validator {
424426
}
425427
}
426428

429+
pub(crate) fn requires_codepoint_mappings(&self) -> bool {
430+
match self {
431+
Validator::None => false,
432+
Validator::A1_A | Validator::A1_B => false,
433+
Validator::A2_A | Validator::A2_B | Validator::A2_U => *self != Validator::A2_B,
434+
Validator::A3_A | Validator::A3_B | Validator::A3_U => *self != Validator::A3_B,
435+
Validator::UA1 => true,
436+
}
437+
}
438+
427439
pub(crate) fn requires_display_doc_title(&self) -> bool {
428440
match self {
429441
Validator::None => false,

0 commit comments

Comments
 (0)