forked from nyu-mll/jiant
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsplit_constituent_data.py
More file actions
executable file
·64 lines (47 loc) · 1.71 KB
/
split_constituent_data.py
File metadata and controls
executable file
·64 lines (47 loc) · 1.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python
# Helper script to split constituent data into POS and nonterminal groups.
#
# TODO: integrate this into the OntoNotes processing script so that both
# splits are generated in one shot.
#
# Usage:
# python split_constituent_data.py /path/to/edge/probing/data/*.json
import copy
import json
import logging as log
import os
import sys
from tqdm import tqdm
from jiant.utils import utils
# Configure logging once at import time: timestamped messages at INFO level.
log.basicConfig(format="%(asctime)s: %(message)s", datefmt="%m/%d %I:%M:%S %p", level=log.INFO)
def split_record(record):
    """Split one record into a POS version and a nonterminal version.

    Each returned record is a deep copy of ``record`` whose ``"targets"``
    list is replaced by a filtered selection of the input's targets:

    * POS record: targets whose ``info["height"]`` equals 1.
    * Nonterminal record: targets whose ``info["height"]`` is greater than 1.

    Targets with any other height appear in neither output. The input
    record itself is not modified.

    Returns:
        A ``(pos_record, non_record)`` tuple.
    """

    def _clone_with(keep_height):
        # Deep-copy the whole record first, then overwrite "targets" so the
        # key keeps its original position in the dict.
        clone = copy.deepcopy(record)
        clone["targets"] = [
            target for target in record["targets"] if keep_height(target["info"]["height"])
        ]
        return clone

    pos_record = _clone_with(lambda height: height == 1)
    non_record = _clone_with(lambda height: height > 1)
    return (pos_record, non_record)
def split_file(fname):
    """Split one constituent data file into POS and nonterminal files.

    Two output files with the same base name as ``fname`` are written into
    ``pos/`` and ``nonterminal/`` subdirectories next to the input file; the
    directories are created if they do not exist. Every input record yields
    exactly one JSON line in each output file.

    Args:
        fname: Path to the input file. It is read with
            ``utils.load_json_data`` (presumably one JSON record per line;
            confirm against the jiant helper).
    """
    dirname, base = os.path.split(fname)

    pos_dir = os.path.join(dirname, "pos")
    os.makedirs(pos_dir, exist_ok=True)
    new_pos_name = os.path.join(pos_dir, base)

    non_dir = os.path.join(dirname, "nonterminal")
    os.makedirs(non_dir, exist_ok=True)
    new_non_name = os.path.join(non_dir, base)

    log.info("Processing file: %s", fname)
    # Materialize the records up front so the progress bar has a known length.
    records = list(utils.load_json_data(fname))
    log.info(" saving to %s and %s", new_pos_name, new_non_name)

    # Context managers guarantee both outputs are flushed and closed, even if
    # a record fails to split or serialize part-way through.
    with open(new_pos_name, "w") as pos_fd, open(new_non_name, "w") as non_fd:
        for record in tqdm(records):
            pos_record, non_record = split_record(record)
            pos_fd.write(json.dumps(pos_record))
            pos_fd.write("\n")
            non_fd.write(json.dumps(non_record))
            non_fd.write("\n")
def main(args):
    """Run ``split_file`` on every path in ``args``, in order."""
    for input_path in args:
        split_file(input_path)
if __name__ == "__main__":
    # Script entry point: every command-line argument is an input file path.
    input_paths = sys.argv[1:]
    main(input_paths)
    sys.exit(0)