-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdocimg2mmax.py
More file actions
120 lines (109 loc) · 7.51 KB
/
Copy pathdocimg2mmax.py
File metadata and controls
120 lines (109 loc) · 7.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import sys, argparse, shutil, os, subprocess
from glob import glob
from pymmax2.pyMMAX2 import *
from multiprocessing import Process
from pathlib import Path
from docimg2mmax_lib import decolor_image, get_predominant_colors, get_chunks, get_line_type, ocrwords_to_lines
from docimg2mmax_lib import png_to_hocr, hocr_to_mmax2, analyse_hocr_line_span, extract_markup, create_html_document
from docimg2mmax_lib import extract_drawable_markup
# This will be called in parallel for chunks of image folders.
# Images of the same document will not be split across chunks.
def docimg2mmax_worker(args, folder_chunk, mmax2_target_folder, proc_namespace):
for img_folder_name in folder_chunk:
if args.verbose:
print("\nProcessing images in folder "+os.path.abspath(img_folder_name), file=sys.stderr)
# Use name of folder with images as base for MMAX2 project name
# If .mmax file exists, it will not be re-created.
mmax2_proj_name = create_mmax2_stub(os.path.basename(img_folder_name), mmax2_target_folder,
clear_basedata=True, clear_levels=['ocr_words', 'ocr_lines'], verbose=False)
mmax2_disc = MMAX2Discourse(mmax2_proj_name, verbose=False)
mmax2_disc.load_markables(verbose=False)
if args.verbose:
print(" Creating "+os.path.abspath(mmax2_proj_name), file=sys.stderr)
pages=None
# pages is 1-based,so 0 means all
if args.pages != "0":
pages=map(int,args.pages.split(","))
# Sort numerically by page number in order to keep correct import order.
# This will only find *original* files in the image folder, not those in the /decol**/ sub-folder
# Choice of decol images is controlled in the png_to_hocr method
png_files = [a[0] for a in
sorted([(f, int( os.path.basename(f)[os.path.basename(f).rfind('-')+1:os.path.basename(f).rfind('.')] ))
for f in glob(img_folder_name+os.path.sep+"*.*") if f.lower().endswith('.png')], key=itemgetter(1))]
for page_idx, png_file in enumerate(png_files):
page_no = page_idx+1
if pages and page_no not in pages:
print("Skipping page no "+str(page_no), file=sys.stderr)
continue
hocr = png_to_hocr(png_file,
['--oem',args.oem,'--psm',args.psm,'--dpi',args.dpi,'-c','tessedit_create_hocr=1','-c','hocr_char_boxes=1','--tessdata-dir',args.tessdata_dir],
args.tmp_path+os.path.sep+proc_namespace+"_tessout.tmp",
normalize_unicode=True, decolor=args.decolor_for_ocr, grey_thresh=50, black_thresh=100, verbose=args.verbose)
hocr_to_mmax2(hocr, page_no, mmax2_disc, os.path.basename(png_file),
ignore_empty_chars=True, split_merged_chars=True, normalize_variants=False, separate_numbers=args.separate_numbers, verbose=args.verbose)
if args.detect_markup:
extract_markup(png_file, mmax2_disc, page_no, vertical=True, horizontal=True,
grey_thresh=int(args.markup_grey_threshold), marked_thresh=int(args.markup_marked_threshold), verbose=args.verbose)
if not args.detect_markup:
# Set markup default
for wo in [a for a in mmax2_disc.get_level('ocr_words').get_markables()]:
wo.update_attributes({'markup': '0'})
else:
# Create HTML page
drawable_markup_per_page=extract_drawable_markup(png_files, mmax2_disc, args.min_markup_percentage)
create_html_document(drawable_markup_per_page, args.html_target_folder+os.path.sep+os.path.basename(mmax2_proj_name)+".html", mmax2_disc, margin_width=1500, scale_by=2)
mmax2_disc.get_basedata().write(dtd_base_path='"', overwrite=True)
mmax2_disc.get_level('ocr_words').write(to_path=mmax2_disc.get_mmax2_path()+mmax2_disc.get_markable_path(), overwrite=True, no_backup=True)
mmax2_disc.get_level('ocr_lines').write(to_path=mmax2_disc.get_mmax2_path()+mmax2_disc.get_markable_path(), overwrite=True, no_backup=True)
mmax2_disc.get_level('text_words').write(to_path=mmax2_disc.get_mmax2_path()+mmax2_disc.get_markable_path(), overwrite=True, no_backup=True)
print(mmax2_disc.info(), file=sys.stderr)
def docimg2mmax(args):
    """Prepare the output folders and run the import in parallel worker processes.

    Creates args.tmp_path and args.html_target_folder if missing, copies the
    MMAX2 stub from ./data/stub/ into args.mmax2_target_folder unless that
    folder already contains a MMAX2 sub-folder, then starts one process per
    chunk returned by get_chunks and waits for all of them to finish.

    Args:
        args: Parsed command line namespace. Attributes read here: tmp_path,
            html_target_folder, mmax2_target_folder, img_folders, workers.
            The namespace is also handed on to every worker.
    """
    # Create folders, if necessary.
    # makedirs with exist_ok also handles nested paths and avoids the
    # check-then-create race of a separate exists() test.
    os.makedirs(args.tmp_path, exist_ok=True)
    os.makedirs(args.html_target_folder, exist_ok=True)

    # Create target data template, if necessary
    mmax2_stub_path = "./data/stub/"
    mmax2_target_folder = args.mmax2_target_folder+os.path.sep+"MMAX2"+os.path.sep
    if not os.path.exists(mmax2_target_folder):
        # The *complete* folder does not exist yet
        os.makedirs(args.mmax2_target_folder, exist_ok=True)
        shutil.copytree(mmax2_stub_path, args.mmax2_target_folder, dirs_exist_ok=True)

    # De-duplicate: a pattern matching several entries of the same folder must
    # not make the same project get processed more than once.
    img_folders = sorted({os.path.dirname(f) for f in glob(args.img_folders)})

    # Start processing
    procs = []
    for folder_chunk in get_chunks(img_folders, int(args.workers)):
        # Namespace = parent pid + running index, used for per-worker temp file names.
        proc_namespace = str(os.getpid())+"_"+str(len(procs))
        p = Process(target=docimg2mmax_worker,
                    args=(args, folder_chunk, mmax2_target_folder, proc_namespace))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
#############################################
if __name__ == '__main__':
    # Command line entry point: declare all options, then hand the parsed
    # namespace to docimg2mmax. All values except the boolean switches are strings.
    cli = argparse.ArgumentParser()

    # Folders of PNG files to be imported. Use quotation marks if argument contains a wildcard: "./imagefile/*/"
    # We expect one folder per document with the individual page file names to *end with* -01.png,
    # so that we can extract the page number. This is the format produced by the pdf2png tool.
    cli.add_argument('--img_folders', required=True)
    # This is the path to the folder *containing* the MMAX2 folder!
    cli.add_argument('--mmax2_target_folder', required=True)
    cli.add_argument('--html_target_folder', default=os.path.join(".", "html", ""))

    # For parallelization
    cli.add_argument('--workers', default="1")
    cli.add_argument('--pages', default="0")
    cli.add_argument('--verbose', action='store_true')
    cli.add_argument('--tmp_path', default=os.path.join(".", "tmp", ""))

    # Switches named --no_* turn off a feature that is on by default.
    cli.add_argument('--no_decolor_for_ocr', dest='decolor_for_ocr', action='store_false')
    cli.add_argument('--detect_markup', action='store_true')
    for threshold_option in ('--markup_grey_threshold', '--markup_marked_threshold'):
        cli.add_argument(threshold_option, default="10")
    cli.add_argument('--min_markup_percentage', default="40")

    # For tokenization
    cli.add_argument('--no_separate_numbers', dest='separate_numbers', action='store_false')

    # tesseract parameters. Stick to actual parameter names (except _ for -)
    for tesseract_option in ('--tessdata_dir', '--oem', '--psm', '--dpi'):
        cli.add_argument(tesseract_option, required=True)

    docimg2mmax(cli.parse_args())