142142 check-replacement-character, check-empty-line
143143"""
144144from binascii import hexlify , unhexlify
145+ from collections import deque
145146from glob import glob
146147from html import unescape
147148from inspect import cleandoc
171172from unidecode import unidecode
172173
173174
174- version = '4.5.0 '
175+ version = '4.5.1 '
175176
176177# Search from start to finish for the string $HEX[], containing a block of a-f0-9 with an even number
177178# of hex chars. The first match group is repeated.
@@ -1016,8 +1017,16 @@ def clean_up(lines):
10161017 """
10171018 results = []
10181019 log = []
1020+ processed_lines = set ()
1021+ work_queue = deque (lines )
1022+
1023+ while work_queue :
1024+ line = work_queue .popleft ()
1025+
1026+ if line in processed_lines :
1027+ continue
1028+ processed_lines .add (line )
10191029
1020- for line in lines :
10211030 # Check if the limit is set; if so, subtract 1, and if 0 is reached, quit.
10221031 if type (config ['limit' ]) is int :
10231032 if config ['limit' ] > 0 :
@@ -1057,7 +1066,7 @@ def clean_up(lines):
10571066 if status :
10581067 # Line contains hex; this function will return a binary string, so add it back to
10591068 # our undecoded lines
1060- lines .append (line_decoded )
1069+ work_queue .append (line_decoded )
10611070 if config ['debug' ]:
10621071 log .append (f'Clean_hex; replaced $HEX[], added to queue and quiting; { line } { linesep } ' )
10631072 # Aborting future processing of this line.
@@ -1069,7 +1078,7 @@ def clean_up(lines):
10691078 if status :
10701079 # Line contains html string, because this can be binary data (linefeeds etc)
10711080 # convert back to binary string and add to queue again.
1072- lines .append (line_decoded .encode ())
1081+ work_queue .append (line_decoded .encode ())
10731082 if config ['debug' ]:
10741083 log .append (f'Clean_html; replaced html, added to queue and quiting; { line_decoded } { linesep } ' )
10751084 stop = True
@@ -1283,49 +1292,49 @@ def clean_up(lines):
12831292 for modified_line in modified_lines :
12841293 if config ['debug' ]:
12851294 log .append (f'Add_split; new line because of split; { modified_line } { linesep } ' )
1286- lines .append (modified_line .encode ())
1295+ work_queue .append (modified_line .encode ())
12871296
12881297 if config .get ('add-lower' ):
12891298 modified_line = add_lower (line_decoded )
12901299 if modified_line :
12911300 if config ['debug' ]:
12921301 log .append (f'Add_lower; new line; { modified_line } { linesep } ' )
1293- lines .append (modified_line .encode ())
1302+ work_queue .append (modified_line .encode ())
12941303
12951304 if config .get ('add-first-upper' ):
12961305 modified_line = add_first_upper (line_decoded )
12971306 if modified_line :
12981307 if config ['debug' ]:
12991308 log .append (f'Add_first_upper; new line; { modified_line } { linesep } ' )
1300- lines .append (modified_line .encode ())
1309+ work_queue .append (modified_line .encode ())
13011310
13021311 if config .get ('add-title-case' ):
13031312 modified_line = add_title_case (line_decoded )
13041313 if modified_line :
13051314 if config ['debug' ]:
13061315 log .append (f'Add_title_case; new line; { modified_line } { linesep } ' )
1307- lines .append (modified_line .encode ())
1316+ work_queue .append (modified_line .encode ())
13081317
13091318 if config .get ('add-latin-ligatures' ):
13101319 modified_line = add_latin_ligatures (line_decoded )
13111320 if modified_line :
13121321 if config ['debug' ]:
13131322 log .append (f'Add_latin_ligatures; new line; { modified_line } { linesep } ' )
1314- lines .append (modified_line .encode ())
1323+ work_queue .append (modified_line .encode ())
13151324
13161325 if config .get ('add-umlaut' ):
13171326 status , modified_line = clean_add_umlaut (line_decoded )
13181327 if status :
13191328 if config ['debug' ]:
13201329 log .append (f'Add_umlaut; new line; { modified_line } { linesep } ' )
1321- lines .append (modified_line .encode ())
1330+ work_queue .append (modified_line .encode ())
13221331
13231332 if config .get ('add-without-punctuation' ):
13241333 modified_line = add_without_punctuation (line_decoded , config .get ('punctuation' ))
13251334 if modified_line :
13261335 if config ['debug' ]:
13271336 log .append (f'Add_without_punctuation; new line; { modified_line } { linesep } ' )
1328- lines .append (modified_line .encode ())
1337+ work_queue .append (modified_line .encode ())
13291338
13301339 if config ['debug' ]:
13311340 log .append (f'----End---- { line_decoded } { linesep } { linesep } ' )
0 commit comments