13
13
14
14
#include " util/encoding.h"
15
15
#include " util/css_const.h"
16
+ #include " util/unicode.h"
16
17
17
18
namespace pdf2htmlEX {
18
19
@@ -32,6 +33,7 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para
32
33
,clip_x1(0 )
33
34
,clip_y1(0 )
34
35
,width(0 )
36
+ ,last_output_unicode(0 )
35
37
{ }
36
38
37
39
void HTMLTextLine::append_unicodes (const Unicode * u, int l, double width)
@@ -88,16 +90,25 @@ void HTMLTextLine::dump_char(std::ostream & out, int pos)
88
90
int c = text[pos];
89
91
if (c > 0 )
90
92
{
91
- Unicode u = c;
92
- writeUnicodes (out, &u, 1 );
93
+ dump_unicode (out, c);
93
94
}
94
95
else if (c < 0 )
95
96
{
96
97
auto dt = decomposed_text[- c - 1 ];
97
- writeUnicodes (out, &dt.front (), dt.size ());
98
+ for (auto it = dt.begin (), end = dt.end (); it != end; it++)
99
+ dump_unicode (out, *it);
98
100
}
99
101
}
100
102
103
+ void HTMLTextLine::dump_unicode (std::ostream & out, Unicode u)
104
+ {
105
+ // ZWSP following space can be optimized out.
106
+ if (u == zero_width_space && last_output_unicode == ' ' )
107
+ return ;
108
+ writeUnicodes (out, &u, 1 );
109
+ last_output_unicode = u;
110
+ }
111
+
101
112
void HTMLTextLine::dump_chars (ostream & out, int begin, int len)
102
113
{
103
114
static const Color transparent (0 , 0 , 0 , true );
@@ -162,6 +173,7 @@ void HTMLTextLine::dump_text(ostream & out)
162
173
<< " " << CSS::BOTTOM_CN << all_manager.bottom .install (line_state.y - clip_y1)
163
174
;
164
175
// it will be closed by the first state
176
+ last_output_unicode = 0 ;
165
177
}
166
178
167
179
std::vector<State*> stack;
@@ -249,8 +261,7 @@ void HTMLTextLine::dump_text(ostream & out)
249
261
double space_off = state_iter1->single_space_offset ();
250
262
if (std::abs (target - space_off) <= param.h_eps )
251
263
{
252
- Unicode u = ' ' ;
253
- writeUnicodes (out, &u, 1 );
264
+ dump_unicode (out, ' ' );
254
265
actual_offset = space_off;
255
266
done = true ;
256
267
}
@@ -269,7 +280,10 @@ void HTMLTextLine::dump_text(ostream & out)
269
280
double threshold = state_iter1->em_size () * (param.space_threshold );
270
281
271
282
out << " <span class=\" " << CSS::WHITESPACE_CN
272
- << ' ' << CSS::WHITESPACE_CN << wid << " \" >" << (target > (threshold - EPS) ? " " : " " ) << " </span>" ;
283
+ << ' ' << CSS::WHITESPACE_CN << wid << " \" >" ;
284
+ if (target > (threshold - EPS))
285
+ dump_unicode (out, ' ' );
286
+ out << " </span>" ;
273
287
}
274
288
}
275
289
}
0 commit comments