Skip to content

Commit 551f1b9

Browse files
committed
Merge branch pull request coolwanglu#493 into merge_all
2 parents 3bbb286 + 9dbd504 commit 551f1b9

File tree

9 files changed

+112
-25
lines changed

9 files changed

+112
-25
lines changed

3rdparty/poppler/git/CairoFontEngine.cc

+6-2
Original file line numberDiff line numberDiff line change
@@ -377,14 +377,18 @@ _ft_new_face (FT_Library lib,
377377

378378
CairoFreeTypeFont::CairoFreeTypeFont(Ref ref,
379379
cairo_font_face_t *cairo_font_face,
380+
FT_Face ft_face,
380381
int *codeToGID,
381382
Guint codeToGIDLen,
382383
GBool substitute) : CairoFont(ref,
383384
cairo_font_face,
384385
codeToGID,
385386
codeToGIDLen,
386387
substitute,
387-
gTrue) { }
388+
gTrue),
389+
// Caution: this field is added by pdf2htmlEX to determine whitespace. Please merge during update.
390+
ft_face(ft_face)
391+
{ }
388392

389393
CairoFreeTypeFont::~CairoFreeTypeFont() { }
390394

@@ -547,7 +551,7 @@ CairoFreeTypeFont *CairoFreeTypeFont::create(GfxFont *gfxFont, XRef *xref,
547551

548552
delete fontLoc;
549553
return new CairoFreeTypeFont(ref,
550-
font_face,
554+
font_face, face,
551555
codeToGID, codeToGIDLen,
552556
substitute);
553557

3rdparty/poppler/git/CairoFontEngine.h

+4-2
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,12 @@ class CairoFreeTypeFont : public CairoFont {
7575
public:
7676
static CairoFreeTypeFont *create(GfxFont *gfxFont, XRef *xref, FT_Library lib, GBool useCIDs);
7777
virtual ~CairoFreeTypeFont();
78-
78+
// Caution: this function is added by pdf2htmlEX to determine whitespace. Please merge during update.
79+
FT_Face get_ft_face() { return ft_face; }
7980
private:
80-
CairoFreeTypeFont(Ref ref, cairo_font_face_t *cairo_font_face,
81+
CairoFreeTypeFont(Ref ref, cairo_font_face_t *cairo_font_face, FT_Face ft_face,
8182
int *codeToGID, Guint codeToGIDLen, GBool substitute);
83+
FT_Face ft_face;
8284
};
8385

8486
//------------------------------------------------------------------------

src/HTMLRenderer/HTMLRenderer.h

+12
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
#include <fstream>
1313
#include <memory>
1414

15+
#include <ft2build.h>
16+
#include FT_FREETYPE_H
1517
#include <OutputDev.h>
1618
#include <GfxState.h>
1719
#include <Stream.h>
@@ -42,6 +44,7 @@
4244
#include "util/const.h"
4345
#include "util/misc.h"
4446

47+
class CairoFontEngine;
4548

4649
namespace pdf2htmlEX {
4750

@@ -217,6 +220,10 @@ struct HTMLRenderer : OutputDev
217220
// make sure the current HTML style consistent with PDF
218221
void prepare_text_line(GfxState * state);
219222

223+
// Check whether this char has a non-empty glyph in this font. If not sure, return true.
224+
// A char that has an empty glyph or no glyph is usually a whitespace.
225+
bool has_glyph(CharCode code, GfxFont* font);
226+
220227
////////////////////////////////////////////////////
221228
// PDF stuffs
222229
////////////////////////////////////////////////////
@@ -341,6 +348,11 @@ struct HTMLRenderer : OutputDev
341348

342349
CoveredTextDetector covered_text_detector;
343350
DrawingTracer tracer;
351+
352+
#if ENABLE_SVG
353+
FT_Library ft_lib;
354+
std::unique_ptr<CairoFontEngine> font_engine;
355+
#endif
344356
};
345357

346358
} //namespace pdf2htmlEX

src/HTMLRenderer/font.cc

+26
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include "CairoFontEngine.h"
3939
#include "CairoOutputDev.h"
4040
#include <Gfx.h>
41+
#include FT_OUTLINE_H
4142
#endif
4243

4344
namespace pdf2htmlEX {
@@ -1086,4 +1087,29 @@ void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, cons
10861087
f_css.fs << "}" << endl;
10871088
}
10881089

1090+
bool HTMLRenderer::has_glyph(CharCode code, GfxFont* font)
1091+
{
1092+
#if ENABLE_SVG
1093+
if (font->getType() == fontType3)
1094+
return true;
1095+
CairoFreeTypeFont* ftfont = (CairoFreeTypeFont*)font_engine->getFont(font, cur_doc, false, xref);
1096+
if (ftfont == nullptr)
1097+
return false;
1098+
FT_Face face = ftfont->get_ft_face();
1099+
if (face == nullptr)
1100+
return false;
1101+
auto gid = ftfont->getGlyph(code, nullptr, 0);
1102+
// gid == 0 means no glyph
1103+
if (gid == 0)
1104+
return false;
1105+
if (FT_Load_Glyph(face, gid, FT_LOAD_NO_SCALE))
1106+
return false;
1107+
FT_GlyphSlot slot = face->glyph;
1108+
// n_contours == 0 means an empty glyph
1109+
if (slot->format == FT_GLYPH_FORMAT_OUTLINE && slot->outline.n_contours == 0)
1110+
return false;
1111+
#endif
1112+
return true;
1113+
}
1114+
10891115
} //namespace pdf2htmlEX

src/HTMLRenderer/general.cc

+12
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@
2929
#include "util/css_const.h"
3030
#include "util/encoding.h"
3131

32+
#if ENABLE_SVG
33+
#include "CairoFontEngine.h"
34+
#endif
35+
3236
namespace pdf2htmlEX {
3337

3438
using std::fixed;
@@ -86,11 +90,19 @@ HTMLRenderer::HTMLRenderer(const Param & param)
8690
[this](double * box, bool partial) { covered_text_detector.add_char_bbox_clipped(box, partial); };
8791
tracer.on_non_char_drawn =
8892
[this](double * box) { covered_text_detector.add_non_char_bbox(box); };
93+
94+
#if ENABLE_SVG
95+
FT_Init_FreeType(&ft_lib);
96+
font_engine = std::unique_ptr<CairoFontEngine>(new CairoFontEngine(ft_lib));
97+
#endif
8998
}
9099

91100
HTMLRenderer::~HTMLRenderer()
92101
{
93102
ffw_finalize();
103+
#if ENABLE_SVG
104+
FT_Done_FreeType(ft_lib);
105+
#endif
94106
}
95107

96108
void HTMLRenderer::process(PDFDoc *doc)

src/HTMLRenderer/text.cc

+27-15
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
7474
while (len > 0)
7575
{
7676
auto n = font->getNextChar(p, len, &code, &u, &uLen, &ax, &ay, &ox, &oy);
77-
HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)\n", (wchar_t)u[0], u[0]));
77+
HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)%s\n", (wchar_t)u[0], u[0], has_glyph(code, font) ? "":" no glyph"));
7878

7979
if(!(equal(ox, 0) && equal(oy, 0)))
8080
{
@@ -113,24 +113,36 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
113113
}
114114
else
115115
{
116-
Unicode uu;
117-
if(cur_text_state.font_info->use_tounicode)
116+
if (uLen == 1 && is_illegal_unicode(u[0]) && !has_glyph(code, font))
118117
{
119-
uu = check_unicode(u, uLen, code, font);
118+
// Convert illegal html unicode to a whitespace, if it has no glyph.
119+
// Add a zero-width space AFTER the offset to make sure words are
120+
// delimited, and make sure the ZWSP can be optimized out if the
121+
// offset is represented by a space (see HTMLTextLine::dump_unicode).
122+
html_text_page.get_cur_line()->append_offset(ddx * draw_text_scale);
123+
html_text_page.get_cur_line()->append_unicodes(&zero_width_space, 1, 0);
120124
}
121125
else
122126
{
123-
uu = unicode_from_font(code, font);
124-
}
125-
html_text_page.get_cur_line()->append_unicodes(&uu, 1, ddx);
126-
/*
127-
* In PDF, word_space is appended if (n == 1 and *p = ' ')
128-
* but in HTML, word_space is appended if (uu == ' ')
129-
*/
130-
int space_count = (is_space ? 1 : 0) - ((uu == ' ') ? 1 : 0);
131-
if(space_count != 0)
132-
{
133-
html_text_page.get_cur_line()->append_offset(cur_word_space * draw_text_scale * space_count);
127+
Unicode uu;
128+
if(cur_text_state.font_info->use_tounicode)
129+
{
130+
uu = check_unicode(u, uLen, code, font);
131+
}
132+
else
133+
{
134+
uu = unicode_from_font(code, font);
135+
}
136+
html_text_page.get_cur_line()->append_unicodes(&uu, 1, ddx);
137+
/*
138+
* In PDF, word_space is appended if (n == 1 and *p = ' ')
139+
* but in HTML, word_space is appended if (uu == ' ')
140+
*/
141+
int space_count = (is_space ? 1 : 0) - ((uu == ' ') ? 1 : 0);
142+
if(space_count != 0)
143+
{
144+
html_text_page.get_cur_line()->append_offset(cur_word_space * draw_text_scale * space_count);
145+
}
134146
}
135147
}
136148
}

src/HTMLTextLine.cc

+20-6
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
#include "util/encoding.h"
1515
#include "util/css_const.h"
16+
#include "util/unicode.h"
1617

1718
namespace pdf2htmlEX {
1819

@@ -32,6 +33,7 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para
3233
,clip_x1(0)
3334
,clip_y1(0)
3435
,width(0)
36+
,last_output_unicode(0)
3537
{ }
3638

3739
void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width)
@@ -88,16 +90,25 @@ void HTMLTextLine::dump_char(std::ostream & out, int pos)
8890
int c = text[pos];
8991
if (c > 0)
9092
{
91-
Unicode u = c;
92-
writeUnicodes(out, &u, 1);
93+
dump_unicode(out, c);
9394
}
9495
else if (c < 0)
9596
{
9697
auto dt = decomposed_text[- c - 1];
97-
writeUnicodes(out, &dt.front(), dt.size());
98+
for (auto it = dt.begin(), end = dt.end(); it != end; it++)
99+
dump_unicode(out, *it);
98100
}
99101
}
100102

103+
void HTMLTextLine::dump_unicode(std::ostream & out, Unicode u)
104+
{
105+
// ZWSP following space can be optimized out.
106+
if (u == zero_width_space && last_output_unicode == ' ')
107+
return;
108+
writeUnicodes(out, &u, 1);
109+
last_output_unicode = u;
110+
}
111+
101112
void HTMLTextLine::dump_chars(ostream & out, int begin, int len)
102113
{
103114
static const Color transparent(0, 0, 0, true);
@@ -162,6 +173,7 @@ void HTMLTextLine::dump_text(ostream & out)
162173
<< " " << CSS::BOTTOM_CN << all_manager.bottom.install(line_state.y - clip_y1)
163174
;
164175
// it will be closed by the first state
176+
last_output_unicode = 0;
165177
}
166178

167179
std::vector<State*> stack;
@@ -249,8 +261,7 @@ void HTMLTextLine::dump_text(ostream & out)
249261
double space_off = state_iter1->single_space_offset();
250262
if(std::abs(target - space_off) <= param.h_eps)
251263
{
252-
Unicode u = ' ';
253-
writeUnicodes(out, &u, 1);
264+
dump_unicode(out, ' ');
254265
actual_offset = space_off;
255266
done = true;
256267
}
@@ -269,7 +280,10 @@ void HTMLTextLine::dump_text(ostream & out)
269280
double threshold = state_iter1->em_size() * (param.space_threshold);
270281

271282
out << "<span class=\"" << CSS::WHITESPACE_CN
272-
<< ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
283+
<< ' ' << CSS::WHITESPACE_CN << wid << "\">";
284+
if (target > (threshold - EPS))
285+
dump_unicode(out, ' ');
286+
out << "</span>";
273287
}
274288
}
275289
}

src/HTMLTextLine.h

+3
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ class HTMLTextLine
107107
*/
108108
void dump_chars(std::ostream & out, int begin, int len);
109109
void dump_char(std::ostream & out, int pos);
110+
void dump_unicode(std::ostream & out, Unicode u);
110111

111112
const Param & param;
112113
AllStateManager & all_manager;
@@ -128,6 +129,8 @@ class HTMLTextLine
128129
*/
129130
std::vector<int> text;
130131
std::vector<std::vector<Unicode> > decomposed_text;
132+
133+
Unicode last_output_unicode; //last unicode written to html (chars in tags excluded)
131134
};
132135

133136
} // namespace pdf2htmlEX

src/util/unicode.h

+2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313

1414
namespace pdf2htmlEX {
1515

16+
const Unicode zero_width_space = 0x200B;
17+
1618
/**
1719
* Check whether a unicode character is illegal for the output HTML.
1820
* Unlike PDF readers, browsers have special treatment for such characters (normally treated as

0 commit comments

Comments
 (0)