This repository was archived by the owner on Jan 13, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 259
/
Copy pathpull.py
48 lines (45 loc) · 1.57 KB
/
pull.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#
# Copyright (c) 2015, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.
#
# Author: Alexander M Rush <[email protected]>
# Sumit Chopra <[email protected]>
# Jason Weston <[email protected]>
"""
Pull out elements of the title-article file.
"""
import sys
#@lint-avoid-python-3-compatibility-imports
# Tokens removed from the lower-cased title in "trg_lc" mode.
# NOTE(review): "-llb-" looks like it may have been meant as "-lrb-"; it is
# kept unchanged so the generated output stays identical -- confirm upstream.
_TITLE_SKIP_TOKENS = frozenset([
    '"', "'", "''", "!", "=", "-",
    "--", ",", "?", ".",
    "``", "`", "-rrb-", "-llb-", "\\/",
])


def _load_words(path):
    """Return the set of first whitespace-separated tokens of each line.

    Blank (or whitespace-only) lines are skipped instead of raising
    IndexError, and the file is closed as soon as it has been read.
    """
    with open(path) as handle:
        return {fields[0]
                for fields in (row.split() for row in handle)
                if fields}


def _mask_unknown(tokens, words_dict):
    """Replace every token that is not in *words_dict* with "<unk>"."""
    return [tok if tok in words_dict else "<unk>" for tok in tokens]


def _extract(line, mode, words_dict):
    """Return the output lines produced by one input line in *mode*.

    The input line is stripped and split on tabs; it must yield exactly four
    fields (title parse, article parse, title, article), otherwise nothing is
    produced. An unrecognised mode also produces nothing.
    """
    fields = line.strip().split("\t")
    if len(fields) != 4:
        return []
    title_parse, article_parse, title, article = fields

    if mode == "src":
        return [article]
    if mode == "trg":
        return [title]
    if mode == "src_lc":
        return [" ".join(_mask_unknown(article.lower().split(), words_dict))]
    if mode == "trg_lc":
        # Drop the skip tokens first, then mask out-of-dictionary words.
        kept = [tok for tok in title.lower().split()
                if tok not in _TITLE_SKIP_TOKENS]
        return [" ".join(_mask_unknown(kept, words_dict))]
    if mode == "srctree":
        return [article_parse]
    if mode == "interleave":
        # Format needed for T3
        return [article_parse, title_parse]
    return []


def main(argv=None, stdin=None, stdout=None):
    """Filter the title-article file from *stdin* to *stdout*.

    argv[1] selects the mode ("src", "trg", "src_lc", "trg_lc", "srctree" or
    "interleave") and argv[2] is the path of the dictionary file whose first
    column lists the known words. Each argument defaults to the matching
    ``sys`` object, so running the file as a script behaves as before.

    Raises SystemExit with a usage message when fewer than two command-line
    arguments are supplied.
    """
    argv = sys.argv if argv is None else argv
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout

    if len(argv) < 3:
        raise SystemExit("usage: pull.py MODE DICT_FILE < title_article_file")

    mode = argv[1]
    words_dict = _load_words(argv[2])
    for line in stdin:
        for out_line in _extract(line, mode, words_dict):
            # write() rather than print(..., file=...) so the script keeps
            # working under Python 2 without a __future__ import.
            stdout.write(out_line + "\n")


if __name__ == "__main__":
    main()