@@ -29,7 +29,7 @@ from PIL import Image
2929from reportlab .pdfgen .canvas import Canvas
3030from reportlab .pdfbase import pdfmetrics
3131from reportlab .pdfbase .ttfonts import TTFont
32- from xml . etree . ElementTree import ElementTree , ParseError
32+ from lxml import etree , html
3333
3434class StdoutWrapper :
3535 """
@@ -70,31 +70,18 @@ def add_text_layer(pdf, image, height, dpi):
7070 p1 = re .compile ('bbox((\s+\d+){4})' )
7171 p2 = re .compile ('baseline((\s+[\d\.\-]+){2})' )
7272 hocrfile = os .path .splitext (image )[0 ] + ".hocr"
73- hocr = ElementTree ()
74- hocr .parse (hocrfile )
75- for line in hocr .findall (".//{http://www.w3.org/1999/xhtml}span" ):
76- if line .attrib ['class' ] != 'ocr_line' :
77- continue
73+ hocr = etree .parse (hocrfile , html .XHTMLParser ())
74+ for line in hocr .xpath ('//*[@class="ocr_line"]' ):
7875 linebox = p1 .search (line .attrib ['title' ]).group (1 ).split ()
7976 try :
8077 baseline = p2 .search (line .attrib ['title' ]).group (1 ).split ()
8178 except AttributeError :
8279 baseline = [ 0 , 0 ]
8380 linebox = [float (i ) for i in linebox ]
8481 baseline = [float (i ) for i in baseline ]
85- for word in line :
86- if word .attrib ['class' ] != 'ocrx_word' :
87- continue
88- if word .text is not None :
89- rawtext = word .text .strip ()
90- else :
91- try :
92- innerword = word [0 ]
93- if innerword .text is not None :
94- rawtext = innerword .text .strip ()
95- else :
96- continue
97- except :
82+ for word in line .xpath ('.//*[@class="ocrx_word"]' ):
83+ rawtext = word .text_content ().strip ()
84+ if rawtext == '' :
9885 continue
9986 font_width = pdf .stringWidth (rawtext , 'invisible' , 8 )
10087 if font_width <= 0 :
0 commit comments