Skip to content

Commit b482964

Browse files
authored
Merge pull request #57 from kba/pdf-xml-parsing
2 parents edfb38e + fb994c3 commit b482964

1 file changed

Lines changed: 6 additions & 19 deletions

File tree

hocr-pdf

Lines changed: 6 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ from PIL import Image
2929
from reportlab.pdfgen.canvas import Canvas
3030
from reportlab.pdfbase import pdfmetrics
3131
from reportlab.pdfbase.ttfonts import TTFont
32-
from xml.etree.ElementTree import ElementTree, ParseError
32+
from lxml import etree, html
3333

3434
class StdoutWrapper:
3535
"""
@@ -70,31 +70,18 @@ def add_text_layer(pdf, image, height, dpi):
7070
p1 = re.compile('bbox((\s+\d+){4})')
7171
p2 = re.compile('baseline((\s+[\d\.\-]+){2})')
7272
hocrfile = os.path.splitext(image)[0] + ".hocr"
73-
hocr = ElementTree()
74-
hocr.parse(hocrfile)
75-
for line in hocr.findall(".//{http://www.w3.org/1999/xhtml}span"):
76-
if line.attrib['class'] != 'ocr_line':
77-
continue
73+
hocr = etree.parse(hocrfile, html.XHTMLParser())
74+
for line in hocr.xpath('//*[@class="ocr_line"]'):
7875
linebox = p1.search(line.attrib['title']).group(1).split()
7976
try:
8077
baseline = p2.search(line.attrib['title']).group(1).split()
8178
except AttributeError:
8279
baseline = [ 0, 0 ]
8380
linebox = [float(i) for i in linebox]
8481
baseline = [float(i) for i in baseline]
85-
for word in line:
86-
if word.attrib['class'] != 'ocrx_word':
87-
continue
88-
if word.text is not None:
89-
rawtext = word.text.strip()
90-
else:
91-
try:
92-
innerword = word[0]
93-
if innerword.text is not None:
94-
rawtext = innerword.text.strip()
95-
else:
96-
continue
97-
except:
82+
for word in line.xpath('.//*[@class="ocrx_word"]'):
83+
rawtext = word.text_content().strip()
84+
if rawtext == '':
9885
continue
9986
font_width = pdf.stringWidth(rawtext, 'invisible', 8)
10087
if font_width <= 0:

0 commit comments

Comments (0)