# tagremovescript.py
import os
import re
from collections import OrderedDict
# Module-level switches read by prune_tags(); listed in the order the
# corresponding cleanup stages run.
remove_tis_and_loras: bool = True  # strip <...> references and withLora(...) calls
remove_weight_syntax: bool = True  # strip numeric, bracket/paren and +/- weighting
remove_slashes: bool = True  # strip every backslash (otherwise only those not before a quote)
dedupe_tags: bool = True  # keep only the first occurrence of each ", "-separated tag
def _load_tag_pattern(tag_list):
    """Compile the blacklist file at *tag_list* into a single removal regex.

    Each non-empty line of the file (trailing whitespace stripped) is one
    literal tag. Returns ``None`` when *tag_list* is falsy or the file holds
    no tags, meaning "no list-based removal".

    Raises:
        OSError: if the blacklist cannot be opened or read; the error is
            printed first and then re-raised.
    """
    if not tag_list:
        return None
    try:
        # utf-8-sig reads plain UTF-8 too, and drops a leading BOM that would
        # otherwise become part of the first tag.
        with open(tag_list, 'r', encoding='utf-8-sig') as f:
            tags = {line.rstrip() for line in f}
    except OSError as t:
        print(".txt blacklist not found, or something else got messed up. idk man idk what I'm doing here. "
              "here's your error:")
        print(t)
        raise
    tags.discard('')  # blank lines would add an empty alternative
    if not tags:
        return None
    # Regex alternation takes the first alternative that matches, so put
    # longer tags first ("blue eyes" before "blue"). Sorting also makes the
    # result independent of set iteration order.
    ordered = sorted(tags, key=lambda tag: (-len(tag), tag))
    return re.compile(r'\b(' + '|'.join(re.escape(tag) for tag in ordered) + r')\b')  # word boundaries


def _clean_caption(content, tag_pattern, strip_weight_syntax, strip_slashes, dedupe, strip_tis_and_loras):
    """Return *content* with blacklisted tags and prompt syntax cleaned up.

    *tag_pattern* is a compiled regex of tags to delete, or ``None``. The
    four flags select the optional cleanup stages.
    """
    # NOTE(review): the original file's indentation was lost; the nesting of
    # the conditional stages below was reconstructed from its comments.
    if tag_pattern is not None:
        content = tag_pattern.sub('', content)  # list-based removal
        if not strip_weight_syntax:
            # Weighting is being kept, so tidy what a removed tag left behind.
            content = re.sub(r"\s+(?=:\d\.\d+)|\s+(?=[)\]])", "", content)  # move back weighting after blacklist
            content = re.sub(r"\(\s+(?=[\w\"'\\])", r"(", content)  # move back weighting after blacklist
            content = re.sub(r"\[\s+(?=[\w\"'\\])", r"[", content)  # move back weighting after blacklist
            content = re.sub(r"(?<![\w!?\"'.;])(:\d\.\d+)|(?<![\w!?\"'.;\]])([)\]]\d\.\d+)|"
                             r"(?<![\D!?\"'.;])([)\]]+[+-]+)|(?<=,)(\s+[+-]+)|"
                             r"(?<=,)([+-]+)", "", content)  # floating numeric/+- weighting
            content = re.sub(r"[([]+(?![\w\"'])", "", content)  # floating first bracket/parens
            content = re.sub(r"[\[(]+(?!\w+)[)\]]+", "", content)  # floating bracket/parens

    content = re.sub("\n+", r", ", content)  # remove newlines they mess things up

    if strip_tis_and_loras:
        # The caption is a single line by now, so these must not be greedy:
        # a greedy match would also delete every tag between the first and
        # the last reference.
        content = re.sub(r"<[^>]+>", "", content)
        content = re.sub(r"withLora.+?,\d\.\d+\)", "", content)

    if strip_weight_syntax:
        content = re.sub(r"[)\]]\d\.\d+", "", content)  # InvokeAI numeric weight syntax
        content = re.sub(r":\d\.\d+[)\]]", "", content)  # Automatic1111-style numeric weight syntax
        content = re.sub(r"[+)(\[\]]+", "", content)  # plusses, brackets, parens
        content = re.sub(r"(?<=\w)(-+)(?!\w)|(?<!\S)(-+)(?!\s\w)", "", content)  # remove hyphens not
        # between characters

    if strip_slashes:
        content = re.sub(r"\\+", "", content)  # slashes

    content = re.sub(r"\s+(?=[.,!?;:])", "", content)  # delete whitespace before punctuation
    content = re.sub(r"\s+(?=['\"*](?!\w))", "", content)  # delete whitespace before apostrophes and
    # quotation mark and *
    content = re.sub(r"([,;?!])(?![\\,!?.\s\"*':;)\]])|"
                     r"(?<![A-Z\d])(\.)(?![\d\s\\,!?.\"*':;)\]])(?![a-z].)|(?<=[A-Z])(\.)(?=[a-z])|"
                     r"(?<=[a-z])(\.)(?=[A-Z])|(:)(?!\d\.\d)(?![\s\"'])(?!\d\d)|"
                     r"(?<=[\"'])([.,;!?:])(?=[\"'])|"
                     r"([.,;!?:\w])(?=\"\w)", r"\g<0> ", content)  # punctuation
    # spacing and whitespace fix
    content = re.sub(r"\s+-+(?=\w)|(?<=[.,;!?:])(-+)(?=\w)", " ", content)  # sometimes - behind text
    # when text deleted

    if not strip_slashes:
        content = re.sub(r"\\+(?!\")", "", content)  # random slashes not behind quotations

    content = re.sub(r",+(,)", r"\1", content)  # extra commas
    content = re.sub(r"\s+", " ", content)  # cut multiple spaces to one
    # Trim both ends before deduplicating: a trailing comma (every file that
    # ends in a newline has one) would otherwise make the last tag look
    # unique, and a removed first tag would leave a leading space.
    content = re.sub(r"^[\s,]+|[\s,]+$", "", content)  # trailing/leading whitespace and commas

    if dedupe:
        words = content.split(', ')
        content = ", ".join(OrderedDict.fromkeys(words))  # first occurrence wins, order kept

    return content


def prune_tags(directory, tag_list, *, strip_weight_syntax=None, strip_slashes=None,
               dedupe=None, strip_tis_and_loras=None):
    """Clean every ``.txt`` file under *directory* in place.

    Walks *directory* recursively, removes the tags listed in the *tag_list*
    file from each ``.txt`` file, applies the enabled cleanup stages, and
    overwrites the file with the result.

    Args:
        directory: root folder to walk.
        tag_list: path to a text file with one tag per line; a falsy value
            skips list-based removal.
        strip_weight_syntax, strip_slashes, dedupe, strip_tis_and_loras:
            per-call overrides for the module-level flags
            ``remove_weight_syntax``, ``remove_slashes``, ``dedupe_tags`` and
            ``remove_tis_and_loras``. ``None`` (the default) uses the flag.

    Raises:
        OSError: if the blacklist or a caption file cannot be read or written.
    """
    if strip_weight_syntax is None:
        strip_weight_syntax = remove_weight_syntax
    if strip_slashes is None:
        strip_slashes = remove_slashes
    if dedupe is None:
        dedupe = dedupe_tags
    if strip_tis_and_loras is None:
        strip_tis_and_loras = remove_tis_and_loras

    # Build the removal pattern once instead of once per file.
    tag_pattern = _load_tag_pattern(tag_list)

    # recursive traversal
    for root, _dirs, files in os.walk(directory):
        for filename in files:
            if not filename.endswith('.txt'):
                continue
            path = os.path.join(root, filename)
            with open(path, 'r', encoding='utf8') as f:
                content = f.read()
            content = _clean_caption(content, tag_pattern, strip_weight_syntax,
                                     strip_slashes, dedupe, strip_tis_and_loras)
            # overwrite the original file with the cleaned version; write with
            # the same encoding it was read with so non-ASCII text survives
            # on platforms whose default encoding is not UTF-8.
            with open(path, 'w', encoding='utf8') as f:
                f.write(content)
# use
if __name__ == "__main__":
    # Run only when executed as a script, so importing this module does not
    # rewrite files. Arguments: the folder to walk, then the blacklist file.
    prune_tags(r"F:\garb\Garb Bin New", r"F:\removenightmaretags.txt")