@@ -464,7 +464,15 @@ impl ContentBuilder {
464
464
465
465
// Separate into distinct glyph runs that either are encoded using actual text, or are
466
466
// not.
467
- let spanned = TextSpanner :: new ( glyphs, text, paint_mode, font_container. clone ( ) ) ;
467
+ let spanned = TextSpanner :: new (
468
+ glyphs,
469
+ text,
470
+ sc. serialize_settings ( )
471
+ . validator
472
+ . requires_codepoint_mappings ( ) ,
473
+ paint_mode,
474
+ font_container. clone ( ) ,
475
+ ) ;
468
476
469
477
for fragment in spanned {
470
478
if let Some ( text) = fragment. actual_text ( ) {
@@ -1094,6 +1102,7 @@ where
1094
1102
{
1095
1103
slice : & ' a [ T ] ,
1096
1104
paint_mode : PaintMode < ' a > ,
1105
+ forbid_invalid_codepoints : bool ,
1097
1106
font_container : Rc < RefCell < FontContainer > > ,
1098
1107
text : & ' a str ,
1099
1108
}
@@ -1105,12 +1114,14 @@ where
1105
1114
pub ( crate ) fn new (
1106
1115
slice : & ' a [ T ] ,
1107
1116
text : & ' a str ,
1117
+ forbid_invalid_codepoints : bool ,
1108
1118
paint_mode : PaintMode < ' a > ,
1109
1119
font_container : Rc < RefCell < FontContainer > > ,
1110
1120
) -> Self {
1111
1121
Self {
1112
1122
slice,
1113
1123
paint_mode,
1124
+ forbid_invalid_codepoints,
1114
1125
text,
1115
1126
font_container,
1116
1127
}
@@ -1128,6 +1139,8 @@ where
1128
1139
fn func < U > (
1129
1140
g : & U ,
1130
1141
paint_mode : PaintMode ,
1142
+ previous_range : Option < Range < usize > > ,
1143
+ forbid_invalid_codepoints : bool ,
1131
1144
mut font_container : RefMut < FontContainer > ,
1132
1145
text : & str ,
1133
1146
) -> ( Range < usize > , bool )
@@ -1145,10 +1158,25 @@ where
1145
1158
let codepoints = pdf_font. get_codepoints ( pdf_glyph) ;
1146
1159
// Check if the glyph has already been assigned codepoints that don't match the
1147
1160
// one we are seeing right now.
1148
- let incompatible_codepoint = codepoints. is_some ( ) && codepoints != Some ( text) ;
1149
-
1150
- // Only set the codepoint if there isn't a previous one.
1151
- if !incompatible_codepoint {
1161
+ let incompatible_codepoint = codepoints.is_some_and(|existing| existing != text);
1162
+
1163
+ // Only set the codepoint if there isn't a previous, different mapping.
1164
+ //
1165
+ // If we could set it, we only want to insert a codepoint if we are not already
1166
+ // building a spanned run (which is the case if the previous range is the same).
1167
+ // If we are building a spanned run, it means that the glyphs are part of the same
1168
+ // cluster, in which case only the first glyph should be assigned the codepoint,
1169
+ // while all other glyphs in the same cluster should not be assigned anything.
1170
+ // Otherwise, when copying text from the PDF, we will get the same codepoint multiple
1171
+ // times in viewers that don't support `ActualText`.
1172
+ //
1173
+ // However, in case we are for example exporting to PDF/UA, every glyph is required
1174
+ // to have a valid codepoint mapping. So in this case, we still add the codepoints
1175
+ // to each glyph in the cluster, although this results in worse copy-pasting in viewers
1176
+ // that don't support `ActualText`.
1177
+ if !incompatible_codepoint
1178
+ && ( previous_range != Some ( range. clone ( ) ) || forbid_invalid_codepoints)
1179
+ {
1152
1180
pdf_font. set_codepoints ( pdf_glyph, text. to_string ( ) ) ;
1153
1181
}
1154
1182
@@ -1165,6 +1193,8 @@ where
1165
1193
let ( first_range, first_incompatible) = func (
1166
1194
iter. next ( ) ?,
1167
1195
self . paint_mode ,
1196
+ None ,
1197
+ self . forbid_invalid_codepoints ,
1168
1198
self . font_container . borrow_mut ( ) ,
1169
1199
self . text ,
1170
1200
) ;
@@ -1175,6 +1205,8 @@ where
1175
1205
let ( next_range, next_incompatible) = func (
1176
1206
next,
1177
1207
self . paint_mode ,
1208
+ Some ( last_range. clone ( ) ) ,
1209
+ self . forbid_invalid_codepoints ,
1178
1210
self . font_container . borrow_mut ( ) ,
1179
1211
self . text ,
1180
1212
) ;
@@ -1183,33 +1215,24 @@ where
1183
1215
// In this case, we just started and we are looking at the first two glyphs.
1184
1216
// This decides whether the current run will be spanned, or not.
1185
1217
None => {
1186
- // The first glyph is incompatible, so we definitely need actual text.
1187
- if first_incompatible {
1218
+ // The two glyphs are in the same range, so we definitely want this run
1219
+ // to be spanned, and also want to include both glyphs in that run.
1220
+ if last_range == next_range {
1188
1221
use_span = Some ( true ) ;
1189
-
1190
- // If the range of the next one is the same, it means they are
1191
- // part of the same cluster, meaning that we need to include it
1192
- // in the actual text. If not, we abort and only wrap the first
1193
- // glyph in actual text.
1194
- if last_range != next_range {
1222
+ } else {
1223
+ // Else, whether we use a span depends on whether the first glyph
1224
+ // is incompatible.
1225
+ use_span = Some ( first_incompatible) ;
1226
+
1227
+ // If either the first glyph or the second glyph are incompatible, they
1228
+ // need to be in separate runs, since they are not part of the same cluster.
1229
+ if first_incompatible || next_incompatible {
1195
1230
break ;
1196
1231
}
1197
- }
1198
1232
1199
- // If the next is incompatible but not part of the current cluster,
1200
- // then it will need a dedicated spanned range, and
1201
- // we can't include it in the current text span. So we abort and
1202
- // create a spanned element with just the first glyph.
1203
- if next_incompatible && last_range != next_range {
1204
- break ;
1233
+ // If none are incompatible, then `use_span` is false, and we can also
1234
+ // include the next glyph in that unspanned run.
1205
1235
}
1206
-
1207
- // If they have the same range, they are part of the same cluster,
1208
- // and thus we started a spanned range with actual text.
1209
- //
1210
- // Otherwise, they are part of a different cluster, and we
1211
- // start a spanned range with no actual text (common case).
1212
- use_span = Some ( last_range == next_range) ;
1213
1236
}
1214
1237
// We are currently building a spanned range, and all glyphs
1215
1238
// are part of the same cluster.
@@ -1251,6 +1274,7 @@ where
1251
1274
true => TextSpan :: Spanned ( head, & self . text [ first_range] ) ,
1252
1275
false => TextSpan :: Unspanned ( head) ,
1253
1276
} ;
1277
+
1254
1278
Some ( fragment)
1255
1279
}
1256
1280
}
0 commit comments