This repository was archived by the owner on Jan 13, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 259
/
Copy pathprocess_agiga.py
executable file
·90 lines (72 loc) · 2.39 KB
/
process_agiga.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#
# Copyright (c) 2015, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.
#
# Author: Alexander M Rush <[email protected]>
# Sumit Chopra <[email protected]>
# Jason Weston <[email protected]>
#!/usr/bin/env python
import sys
import os
import re
import gzip
#@lint-avoid-python-3-compatibility-imports
# Command line: sys.argv[1] is the input path, whose last two components
# (<subdir>/<name>.xml.gz) name the output; sys.argv[2] is the root directory
# that receives <subdir>/<name>.txt.
src_parts = sys.argv[1].split("/")
out_dir = os.path.join(sys.argv[2], src_parts[-2])

# Make directory for output if it doesn't exist
try:
    os.mkdir(out_dir)
except OSError:
    # An already-existing directory is fine; any other failure is re-raised
    # here instead of surfacing later as a confusing open() error.
    if not os.path.isdir(out_dir):
        raise

# Strip off the .xml.gz ending and replace it with .txt. The slice drops the
# last len(".xml.gz") characters unconditionally; the suffix is not checked.
end = "/".join(src_parts[-2:])[:-len(".xml.gz")] + ".txt"

# Join against out_dir so the file lands in the directory created above
# whether or not sys.argv[2] ends with a path separator.
out = open(os.path.join(out_dir, os.path.basename(end)), "w")

# Parse and print titles and articles
# Parser states used by the main loop:
#   NONE - waiting for a <HEADLINE> line
#   HEAD - the current line is the headline parse
#   NEXT - headline read, waiting for a <P> line
#   TEXT - collecting lines until </P>
NONE, HEAD, NEXT, TEXT = 0, 1, 2, 3
MODE = NONE
title_parse = ""
article_parse = []
# FIX: Some parses are mis-parenthesized.
def fix_paren(parse):
    """Unwrap a parse that is enclosed in a stray "( ... )" pair.

    When *parse* begins with "( ", its first two characters and its final
    character are dropped. Any other input, including strings shorter than
    two characters, is returned unchanged.
    """
    # A slice never raises on short input, so no explicit length guard is
    # needed: a string shorter than two characters cannot equal "( ".
    if parse[:2] == "( ":
        return parse[2:-1]
    return parse
def get_words(parse):
    """Return the leaf words of a bracketed parse, up to the first ".".

    Whitespace-separated tokens that end in ")" are treated as leaves; the
    ")" characters are stripped from both ends of the token. Collection stops
    right after the first leaf that is exactly "." (which is included).
    """
    collected = []
    for token in parse.split():
        # Tokens such as "(NP" open a constituent and carry no word.
        if not token.endswith(")"):
            continue
        word = token.strip(")")
        collected.append(word)
        if word == ".":
            break
    return collected
def remove_digits(parse):
    """Return *parse* with every digit character replaced by "#"."""
    digit = re.compile(r'\d')
    return digit.sub('#', parse)
# Walk the gzipped input line by line. For every <HEADLINE> the next line is
# taken as the title parse; the lines of the first following <P> ... </P>
# paragraph are taken as sentence parses, and one tab-separated record is
# written to `out`.
# NOTE(review): the marker comparisons below assume gzip yields `str` lines,
# as it does under Python 2, which this script targets.
source = gzip.open(sys.argv[1])
try:
    for raw_line in source:
        line = raw_line.strip()

        # The checks are deliberately sequential `if`s (not `elif`): a state
        # change made by one check is visible to the checks after it within
        # the same iteration.
        if MODE == HEAD:
            title_parse = remove_digits(fix_paren(line))
            MODE = NEXT

        if MODE == TEXT:
            # NOTE(review): this also collects the closing "</P>" line
            # itself, since it runs before the "</P>" check below. Behavior
            # kept as-is because it affects the emitted data.
            article_parse.append(remove_digits(fix_paren(line)))

        if MODE == NONE and line == "<HEADLINE>":
            MODE = HEAD

        if MODE == NEXT and line == "<P>":
            MODE = TEXT

        if MODE == TEXT and line == "</P>":
            articles = []
            # Annotated gigaword has a poor sentence segmenter.
            # Ensure there is at least a period: keep appending collected
            # lines until one contains a "(. .)" leaf.
            for sentence in article_parse:
                articles.append(sentence)
                if "(. .)" in sentence:
                    break

            article_parse = "(TOP " + " ".join(articles) + ")"

            # title_parse \t article_parse \t title \t article
            # out.write(... + "\n") emits exactly what `print >>out, ...`
            # did, without relying on the Python-2-only print statement.
            out.write("\t".join([title_parse, article_parse,
                                 " ".join(get_words(title_parse)),
                                 " ".join(get_words(article_parse))]) + "\n")

            # Reset for the next article.
            article_parse = []
            MODE = NONE
finally:
    # Close both handles even if parsing fails, so buffered output is
    # flushed and no file descriptors are left open.
    try:
        source.close()
    finally:
        out.close()