Merge pull request #57 from kba/pdf-xml-parsing

zuphilip · web-flow · commit b482964926ca · 2016-09-17T14:22:27.000+02:00
diff --git a/hocr-pdf b/hocr-pdf
@@ -29,7 +29,7 @@ from PIL import Image
 from reportlab.pdfgen.canvas import Canvas
 from reportlab.pdfbase import pdfmetrics
 from reportlab.pdfbase.ttfonts import TTFont
-from xml.etree.ElementTree import ElementTree, ParseError
+from lxml import etree, html
 
 class StdoutWrapper:
     """
@@ -70,31 +70,18 @@ def add_text_layer(pdf, image, height, dpi):
   p1 = re.compile('bbox((\s+\d+){4})')
   p2 = re.compile('baseline((\s+[\d\.\-]+){2})')
   hocrfile = os.path.splitext(image)[0] + ".hocr"
-  hocr = ElementTree()
-  hocr.parse(hocrfile)
-  for line in hocr.findall(".//{http://www.w3.org/1999/xhtml}span"):
-    if line.attrib['class'] != 'ocr_line':
-      continue
+  hocr = etree.parse(hocrfile, html.XHTMLParser())
+  for line in hocr.xpath('//*[@class="ocr_line"]'):
     linebox = p1.search(line.attrib['title']).group(1).split()
     try:
       baseline = p2.search(line.attrib['title']).group(1).split()
     except AttributeError:
       baseline = [ 0, 0 ]
     linebox = [float(i) for i in linebox]
     baseline = [float(i) for i in baseline]
-    for word in line:
-      if word.attrib['class'] != 'ocrx_word':
-        continue
-      if word.text is not None:
-        rawtext = word.text.strip()
-      else:
-        try:
-          innerword = word[0]
-          if innerword.text is not None:
-            rawtext = innerword.text.strip()
-          else:
-            continue  
-        except:
+    for word in line.xpath('.//*[@class="ocrx_word"]'):
+      rawtext = word.text_content().strip()
+      if rawtext == '':
           continue
       font_width = pdf.stringWidth(rawtext, 'invisible', 8)
       if font_width <= 0: