-
Notifications
You must be signed in to change notification settings - Fork 21
Expand file tree
/
Copy pathconcatenate.py
More file actions
executable file
·74 lines (66 loc) · 2.19 KB
/
Copy pathconcatenate.py
File metadata and controls
executable file
·74 lines (66 loc) · 2.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python
"""Concatenates source and features columns.
Note that:
* Output is to stdout to allow chaining.
* The source string in the output TSV will have space as its separator.
* Features will be kept distinct using `[...]`.
* Other columns are ignored; they can be merged back in using UNIX `cut` and
`paste`.
"""
import argparse
import csv
import os
import sys
from yoyodyne import defaults
from yoyodyne.data import tsv
def main(args: argparse.Namespace) -> None:
    """Writes concatenated source and features symbols to stdout.

    Each sample read from the input TSV yields one output row consisting of
    the source symbols followed by the features symbols, all joined by a
    single space.

    Args:
        args: parsed command-line arguments; the attributes read here are
            `input_tsv`, `source_col`, `features_col`, `source_sep` and
            `features_sep`.

    Raises:
        ValueError: if the TSV parser reports that it has no features.
    """
    tsv_parser = tsv.TsvParser(
        source_col=args.source_col,
        features_col=args.features_col,
        target_col=0,
        source_sep=args.source_sep,
        features_sep=args.features_sep,
    )
    # Explicit check rather than `assert`: assertions are stripped when
    # Python runs with -O, which would silently skip this validation.
    if not tsv_parser.has_features:
        raise ValueError(
            "A features column is required; check --features_col "
            f"(got {args.features_col!r})."
        )
    # This is really just a one-column TSV but we want to make sure
    # escaping is handled properly.
    # The terminator is a bare "\n" because `sys.stdout`, in its default text
    # mode, already translates "\n" to the platform line ending; passing
    # `os.linesep` would be translated a second time (e.g., "\r\r\n" where
    # the separator is "\r\n").
    tsv_writer = csv.writer(sys.stdout, delimiter="\t", lineterminator="\n")
    for source, features in tsv_parser.samples(args.input_tsv):
        # Builds a fresh list instead of extending `source` in place, so the
        # object handed out by the parser is left untouched.
        tsv_writer.writerow([" ".join([*source, *features])])
if __name__ == "__main__":
    # Command-line interface: one positional input path plus optional
    # column indices and separators, all forwarded to `main`.
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("input_tsv")
    # Integer-valued column selectors.
    # NOTE(review): --features_col defaults to `defaults.TARGET_COL`, not a
    # features-specific default; presumably deliberate, so that a features
    # column is selected out of the box. Confirm against `yoyodyne.defaults`.
    column_specs = (
        (
            "--source_col",
            defaults.SOURCE_COL,
            "1-indexed column for the input/output source column. "
            "Default: %(default)s.",
        ),
        (
            "--features_col",
            defaults.TARGET_COL,
            "1-indexed column for the input features column. "
            "Default: %(default)s.",
        ),
    )
    for flag, default, help_text in column_specs:
        cli.add_argument(flag, type=int, default=default, help=help_text)
    # String-valued symbol separators.
    separator_specs = (
        (
            "--source_sep",
            defaults.SOURCE_SEP,
            "String used to split the input source string into symbols; "
            "an empty string indicates that each Unicode codepoint "
            "is its own symbol. Default: %(default)r.",
        ),
        (
            "--features_sep",
            defaults.FEATURES_SEP,
            "String used to split features string into symbols; "
            "an empty string indicates that each Unicode codepoint "
            "is its own symbol. Default: %(default)r.",
        ),
    )
    for flag, default, help_text in separator_specs:
        cli.add_argument(flag, type=str, default=default, help=help_text)
    main(cli.parse_args())