This repository was archived by the owner on Jun 22, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess_texts.py
More file actions
44 lines (37 loc) · 1.38 KB
/
preprocess_texts.py
File metadata and controls
44 lines (37 loc) · 1.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import argparse
def convert(text: str) -> str:
    """Return *text* as space-separated symbols.

    Each space becomes "$", "[" becomes "P" and "]" becomes "Q"; every other
    character is kept as-is. The resulting symbols are then joined with a
    single space between them.

    Characters "$", "P" and "Q" already present in *text* pass through
    unchanged, so in the output they look the same as encoded spaces and
    brackets.

    >>> convert("a [b]")
    'a $ P b Q'
    """
    # One pass over the characters instead of chained replace() calls.
    substitutions = str.maketrans({" ": "$", "[": "P", "]": "Q"})
    symbols = text.translate(substitutions)
    return " ".join(symbols)
def reverse_convert(text: str) -> str:
    """Collapse space-separated symbols back into plain text.

    Surrounding whitespace is stripped and every space character is removed.
    Then "$" becomes a space, "P" becomes "[" and "Q" becomes "]".

    >>> reverse_convert("a $ P b Q")
    'a [b]'
    """
    # Only the space character is removed; other interior whitespace such
    # as tabs is kept.
    compact = text.strip().replace(" ", "")
    restore = str.maketrans({"$": " ", "P": "[", "Q": "]"})
    return compact.translate(restore)
if __name__ == "__main__":
    # Command-line entry point: read `infile` line by line, run each
    # non-blank line through convert() (or reverse_convert() when --reverse
    # is given) and write the result to `outfile`.
    cli = argparse.ArgumentParser()
    cli.add_argument("infile")
    cli.add_argument("outfile")
    for switch in ("--in-uttids", "--out-uttids", "--reverse"):
        cli.add_argument(switch, action="store_true")
    args = cli.parse_args()

    # The direction is fixed for the whole run, so pick the function once.
    transform = reverse_convert if args.reverse else convert

    with open(args.infile, "r", encoding="utf-8") as src, \
            open(args.outfile, "w", encoding="utf-8") as dst:
        for raw_line in src:
            body = raw_line.strip()
            if not body:
                # Blank lines produce no output line.
                continue

            utt_id = ""
            if args.in_uttids:
                # The first whitespace-delimited token is the utterance id;
                # a line holding only an id has empty text.
                pieces = body.split(maxsplit=1)
                utt_id = pieces[0]
                body = pieces[1] if len(pieces) == 2 else ""

            result = transform(body)
            # With --out-uttids the id (possibly empty) is written first,
            # separated from the text by a single space.
            fields = (utt_id, result) if args.out_uttids else (result,)
            print(*fields, file=dst)