-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcheck_complex_script.py
More file actions
157 lines (130 loc) · 4.79 KB
/
check_complex_script.py
File metadata and controls
157 lines (130 loc) · 4.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/python3
#-*- coding: utf-8 -*-
# Read Unicode documents and check the data on the complex scripts found.
import glob
from io import BytesIO
import os
import sys
import docx
from docx import Document
from docx.oxml.shared import OxmlElement, qn
from convertDoc2 import ConvertDocx
# Default language code for this script. 'phk' is the one language that
# checkComplex special-cases (it selects the PhakeRamayanaUnicode font).
# NOTE(review): not referenced by name elsewhere in the visible code; the
# literal 'phk' is repeated instead.
default_langCode = 'phk'
# get uploaded file into document form
def createDocFromFile(file_path):
    """Load a .docx file from disk into a python-docx Document.

    The file is read fully into memory and the Document is parsed from an
    in-memory buffer, so the on-disk handle is released before parsing.

    Args:
        file_path: Path of the .docx file to open.

    Returns:
        A ``(doc, count)`` tuple holding the parsed Document and the number
        of bytes read. On failure the error is printed and ``(None, -1)`` is
        returned instead of raising.
    """
    try:
        # The context manager closes the handle even when read() fails;
        # the previous explicit close() was skipped on any error.
        with open(file_path, 'rb') as docx_file:
            raw_bytes = docx_file.read()
        doc = Document(BytesIO(raw_bytes))
    except Exception as err:
        # Exception (not BaseException) so Ctrl-C and SystemExit still
        # propagate instead of being reported as a bad document.
        print('Cannot create Docx for %s. Err = %s' % (file_path, err))
        return None, -1
    return doc, len(raw_bytes)
def fix_cs_formatting_run(run_to_fix, user_cs_font_size, user_cs_font_name,
                          user_is_bold=None, langCode='phk', is_bidi=False):
    """Force complex-script font settings onto a single run.

    Writes the font name, size, language and (optionally) bold and
    right-to-left flags directly into the run's ``w:rPr`` element, then marks
    the run as complex script.

    Background:
    https://stackoverflow.com/questions/45627652/python-docx-add-style-with-ctl-complex-text-layout-language

    Args:
        run_to_fix: python-docx run whose properties are rewritten in place.
        user_cs_font_size: Font size in points; written as half-points.
        user_cs_font_name: Font name applied to the cs, ascii, hAnsi and
            eastAsia font slots.
        user_is_bold: When truthy, both ``w:b`` and ``w:bCs`` are switched on.
            When falsy, existing bold settings are left untouched.
        langCode: Value written to the ``w:val`` and ``w:bidi`` attributes of
            ``w:lang``.
        is_bidi: When true, a ``w:rtl`` element is added to the run.
    """
    rpr = run_to_fix.element.get_or_add_rPr()
    rFonts = rpr.get_or_add_rFonts()

    def _child(tag):
        # Reuse an existing child so repeated calls (or runs that already
        # carry the property) do not accumulate duplicate elements.
        element = rpr.find(qn(tag))
        if element is None:
            element = OxmlElement(tag)
            rpr.append(element)
        return element

    if is_bidi:
        _child('w:rtl')  # right-to-left run

    # Sizes are stored in half-points. get_or_add_sz() already returns the
    # single w:sz element; appending a second one left the first without a
    # w:val attribute.
    half_points = str(int(user_cs_font_size * 2))
    rpr.get_or_add_sz().set(qn('w:val'), half_points)
    _child('w:szCs').set(qn('w:val'), half_points)

    lang = _child('w:lang')
    lang.set(qn('w:val'), langCode)
    lang.set(qn('w:bidi'), langCode)  # This depends on the language code

    if user_is_bold:
        # On/off attributes must use the lowercase literal; "True" is
        # rejected when the value is parsed back.
        _child('w:bCs').set(qn('w:val'), 'true')  # bold the complex script
        _child('w:b').set(qn('w:val'), 'true')    # bold the Latin text

    for font_slot in ('w:cs', 'w:ascii', 'w:hAnsi', 'w:eastAsia'):
        rFonts.set(qn(font_slot), user_cs_font_name)

    # Set script as complex
    run_to_fix.font.complex_script = True
def fix_paragraph_runs(para, user_cs_font_name=None, user_cs_font_size=12):
    """Apply complex-script formatting to the runs of one paragraph.

    Only runs whose explicit font name equals ``user_cs_font_name`` are
    rewritten; all other runs are left as they are.

    Args:
        para: Paragraph object exposing ``runs``; each run exposes
            ``font.name``.
        user_cs_font_name: Font name to look for and to re-apply. When it is
            None there is nothing to match, so the paragraph is left alone.
        user_cs_font_size: Font size in points handed to the run fixer.
    """
    if user_cs_font_name is None:
        # Without this guard every run lacking an explicit font (name is
        # None) would match and None would be passed on as the font name.
        return
    for run in para.runs:
        if run.font.name == user_cs_font_name:
            # cs: complex script, ex, arabic. Bold is never forced here.
            fix_cs_formatting_run(run, user_cs_font_size, user_cs_font_name,
                                  False)
def _iter_document_paragraphs(document):
    """Yield body paragraphs first, then every paragraph in table cells."""
    for para in document.paragraphs:
        yield para
    for table in document.tables:
        for row in table.rows:
            for cell in row.cells:
                for para in cell.paragraphs:
                    yield para


def checkComplex(lang, input_path, document=None, save_doc=False):
    """Fix complex-script run formatting throughout a document.

    Args:
        lang: Language code. 'phk' selects the PhakeRamayanaUnicode font;
            any other value uses Arial. Both use 12 pt.
        input_path: Path of the source .docx. It is read when ``document`` is
            not supplied and used to derive the output name when saving.
        document: Already-loaded Document to process, or None to load
            ``input_path``.
        save_doc: When true, the result is saved next to the input with
            '_Unicode' in the file name replaced by '_UnicodeFixed'.

    Returns:
        None. If the document cannot be loaded nothing is processed or saved.
    """
    # The original tested the Document class (always truthy), so the file
    # was never loaded when no document was passed in.
    if document is None:
        document, _ = createDocFromFile(input_path)
        if document is None:
            # createDocFromFile has already printed the reason.
            return

    user_cs_font_size = 12
    user_cs_font_name = 'Arial'  # Default
    if lang == 'phk':
        user_cs_font_name = 'PhakeRamayanaUnicode'

    # Body text and table cells get the same font settings; the table pass
    # previously omitted them and so fell back to a None font name.
    for para in _iter_document_paragraphs(document):
        fix_paragraph_runs(para, user_cs_font_name=user_cs_font_name,
                           user_cs_font_size=user_cs_font_size)

    if save_doc:
        # Rename only the file component so a directory that happens to
        # contain '_Unicode' is not rewritten into a path that may not exist.
        folder, file_name = os.path.split(input_path)
        new_doc_name = os.path.join(
            folder, file_name.replace('_Unicode', '_UnicodeFixed'))
        document.save(new_doc_name)
    return
def main(argv):
    """Command-line entry point: fix complex-script formatting in .docx files.

    Args:
        argv: Argument list in ``sys.argv`` layout: program name, language
            code, then one or more .docx files or directories. Directories
            are expanded to the ``*.docx`` files directly inside them.

    Only files whose name contains '_Unicode.' are processed; each is saved
    as a new file by checkComplex.
    """
    if len(argv) < 3:
        # The banner used to describe a different tool (font-encoding
        # conversion); this script checks complex-script formatting.
        print('Check complex script formatting in Unicode .docx files')
        print('Usage: python3 command_line lang_code file1 file2 file ...')
        return

    lang = argv[1]

    # For each item in the list, [2:...]
    files = []
    for doc_path in argv[2:]:
        if os.path.isdir(doc_path):
            # Expand with glob
            files.extend(glob.glob(os.path.join(doc_path, '*.docx')))
        else:
            files.append(doc_path)

    for file_path in files:
        # Only look at Unicode converted files. Test the file name alone so
        # a directory component containing '_Unicode.' cannot cause a match.
        if '_Unicode.' not in os.path.basename(file_path):
            continue
        print('Checking complex scripts %s in document %s' % (lang, file_path))
        checkComplex(lang, file_path, save_doc=True)


if __name__ == '__main__':
    main(sys.argv)