lcfrs-inducer/inducer.py at main · TyrantNeptune/lcfrs-inducer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
import getopt

class Grammar:
	"""
	A class for LCFRS-grammars.

	Attributes:
	 self.rules: a list of Rule objects

	Functions:
	 self.__init__(f): from an inputfile "f" in NeGra-format collect a list of trees, then for each tree call the function self.induce
	 self.induce(tree): induce LCFRS rules from a tree "tree" and add them to self.rules if they are not a duplicate of a rule already in there
	 self.pprint(): print the form of each rule in self.rules in alphabetical order
	 self.pwrite(output): write the form of each rule in self.rules on outputfile "output" in alphabetical order
	"""
	def __init__(self,f):
		"""
		Collects a list of trees from a treebank file, then calls the function self.induce for each tree.

		Input: a treebank file "f" in NeGra-format
		"""
		self.rules = []
		with open(f, 'r') as fileObject:
			lines = fileObject.read().splitlines()
		trees = []
		for line in lines:
			if line[:4] == "#BOS":
				tree = [] #Initializing a new treelist at beginning of sequence (#BOS)
			elif line[:4] == "#EOS":
				index = line[5:]
				trees.append(tree) #Appending the tree to the list of trees at end of sequence (#EOS)
			else:
				tree.append(line) #Putting nodes (lines) in the treelist
		count = 0
		for tree in trees:
			self.induce(tree)
			count +=1
			if count%100==0: #counter to check the progress
				print(str(count)+" trees complete")
		print("induction complete")

	def induce(self,tree):
		"""
		Induces LCFRS rules from a tree "tree" and add them to self.rules if they are not a duplicate of a rule already in there.

		Input: a list of lines representing nodes from a tree in NeGra-format
		"""
		nodes = {} #we put the nodes in a dictionary so it becomes easy to look them up by their id
		leaves = []
		for line in tree:
			node = Node(line.split("\t"))
			if node.id[0] == "#":
				nodes[node.id[1:]]=node #the "#" is sliced off since it also doesn't appear in node.mother
			else: #if the nodeID doesn't start with "#", the nodeID is instead the word on the input string, and therefore this node is a leaf node
				leaves.append(node)
		for s in range(len(leaves)):
			leaves[s].strID=[s]
			"""Since we're looping over the leaves anyway, we're setting the string ID right here.
			Since the words are in order of their appearance the string ID matches the position in the list of leaves"""
			mother = nodes[leaves[s].mother]
			mother.daughters.append(leaves[s]) #As should be obvious, a node is one of its mother's daughter nodes
		for node in nodes.values():
			if node.mother != "0": #O is the ID of a root node's mother. Since a root node is not the daugther of any node, the following lines do not apply to them
				mother = nodes[node.mother]
				mother.daughters.append(node)
		nodes2 = [] #now that we have found the mother nodes, we don't need a dictionary anymore, so we are going to collect our nodes in a list
		nodesval = list(nodes.values()) #We go from a dict to a list because we want to edit the data structure while looping
		while nodesval != []: #only stop this loop if we are sure that all nodes have their string ID
			for node in nodesval:
				if node.allDaughtersHaveStrID(): #Do not get the string ID until all daughter nodes have their string ID's
					node.getStrID()
					nodes2.append(node)
					#When the string ID is obtained, move the node from nodesval to nodes2
					nodesval.remove(node)
		for leaf in leaves:
			newrule = Rule(leaf)
			if self.rules ==[]: #if this is the first rule, we don't need to check if it's a duplicate
				self.rules.append(newrule)
			else:
				isDupl = False
				for rule in self.rules:
					if newrule.form == rule.form: #if two rules have the same form, they are duplicates of eachother, so the new one should not be added to the grammar
						isDupl = True
						break
				if not isDupl:
					self.rules.append(newrule)
		for node in nodes2:
			newrule = Rule(node)
			if self.rules ==[]:
				self.rules.append(newrule)
			else:
				isDupl = False
				for rule in self.rules:
					if newrule.form == rule.form:
						isDupl = True
						break
				if not isDupl:
					self.rules.append(newrule)

	def pprint(self):
		"""
		Prints the form of each rule in self.rules in alphabetical order
		"""
		prules=[] #We make a list so that we can sort the rules
		for rule in self.rules:
			prules.append(rule.form)
		prules.sort()
		for rule in prules:
			print(rule)

	def pwrite(self, output):
		"""
		Writes the form of each rule in self.rules on outputfile "output" in alphabetical order

		Input: a file to write the rules onto
		"""
		prules=[] #We make a list so that we can sort the rules
		for rule in self.rules:
			prules.append(rule.form)
		prules.sort()
		with open(output, 'w') as fileObject:
			for rule in prules:
				fileObject.write(rule+"\n")

class Rule:
	"""
	A class for LCFRS-rules.

	Attributes:
	 self.lhs: a Predicate object representing the left-hand side of the rule
	 self.rhs: a list of Predicate objects, or the string "eps" representing the right-hand side of the rule
	 self.form: a string representation of the rule

	Functions:
	 self.__init__(node): From a node object, set the lhs as a predicate with the node lable as its name and the node's arguments as its argument. Set the rhs as a list of predicates based on the node's daughters in a similar fashion. Then contract the lhs and rhs arguments according to the induction algorithm, and finally set self.form as the string representation of the rule by calling self.makeform().
	 self.mkform(): Sets self.form the string representation of the rule
	"""
	def __init__(self,node):
		"""
		Sets, from a node object, the lhs as a predicate with the node lable as its name and the node's arguments as its argument, then sets the rhs as a list of predicates based on the node's daughters in a similar fashion, then contracts the lhs and rhs arguments according to the induction algorithm, and finally sets self.form as the string representation of the rule by calling self.makeform()

		Input: a Node object that either is a leaf node or has all of its daughters listed
		"""
		if node.id[0]!="#": #Again, we check if the node is a leaf node
			self.lhs = Predicate(node.label,[node.id]) #The left hand side of an LCFRS rule consits of a predicate, with the node label as their name. For terminal rules, the argument is the word on the input string
			self.rhs = "eps" #Terminal rules have an empty string (represented by "eps" for epsilon) as their right-hand side
		else:
			args = node.findArgs()
			self.lhs = Predicate(node.label,args)
			self.rhs = []
			for daughter in node.daughters: #The right-hand side of non-terminal rules consists of a list of predicates that are based on the predicates of the daughter nodes.
				args = daughter.findArgs()
				self.rhs.append(Predicate(daughter.label,args))
			count = 0 #We start a new counter to make sure the variables in the arguments of a rule have different names
			for pred in self.rhs:
				"""Each right-hand side argument needs only to be represented by a single variable.
				We then need to make sure that same variable appears on the left-hand side at the same position"""
				for i in range(len(pred.args)):
					for j in range(len(self.lhs.args)):
						if pred.args[i] in self.lhs.args[j]:
							self.lhs.args[j]+="|" #We add "|" as an end of string marker"
							self.lhs.args[j]=self.lhs.args[j].replace(pred.args[i]+"Y","X_"+str(count)+"Y")
							self.lhs.args[j]=self.lhs.args[j].replace(pred.args[i]+"X","X_"+str(count)+"X")
							self.lhs.args[j]=self.lhs.args[j].replace(pred.args[i]+"|","X_"+str(count)+"|")
							#We check the next symbol to make sure it's not another numeric symbol
							self.lhs.args[j]=self.lhs.args[j][:-1] #We remove the end of string marker again
							pred.args[i] = "X_"+str(count) #We now replace the right hand side argument with the same variable
							count+=1
		self.mkform()

	def mkform(self):
		"""
		Sets self.form as the string representation of the rule
		"""
		form = ""
		form += self.lhs.mkform()
		form += "->" #By convention, left-hand side and right-hand side are separated by an arrow symbol
		if self.rhs == "eps":
			form += self.rhs #if the right-hand side is already a string, we can append it immediately
		else:
			for pred in self.rhs:
				form += pred.mkform() #By convention, right-hand side predicates are not separated by a symbol
		self.form = form


class Predicate:
	"""
	A class for LCFRS predicates

	Attributes:
	 self.name: a string representing the predicate's name
	 self.args: a list of strings representing the predicate's arguments

	Functions:
	 self.__init__(name,args): set self.name and self.args
	 self.mkform(): return the form of the predicate as a string
	"""
	def __init__(self,name,args):
		"""
		Sets self.name and self.args

		Input:
		 name: a string representing the predicate's name
		 args: a list of strings representing the predicate's arguments
		"""
		self.name = name
		self.args = args

	def mkform(self):
		"""
		Returns the form of the predicate as a string

		Output: the form of the predicate as a string
		"""
		form = ""
		form += self.name
		form += "(" #By convention, the arguments of a predicate appear in between parentheses
		for arg in self.args:
			form += arg
			if arg == self.args[-1]: #if this is the last argument, the next symbol needs to be the closing bracket
				form += ")"
			else:
				form += "," #By convention, arguments are separated by a comma
		return form

class Node:
	"""
	A class for node objects obtained from a treebank in NeGra-format

	Attributes:
	 self.id: a string representing the node id, or the word on the input string in case of a leaf node
	 self.label: a string representing the node label
	 self.mother: a string matching the node lable of this node's mother node
	 self.strID: a list of integers representing the positions of words on the input string dominated by this node
	 self.daughters: a list of node objects that are daughter nodes of this node

	functions:
	 self.init(l): set self.id, self.label and self.mother based on the inputted node as a list of strings, then initialize self.strID and self.daughters as empty lists.
	 self.allDaughtersHaveStrID(): return False if there is a daughter whose string ID is empty, otherwise, return True
	 self.getStrID(): set self.strID based on the string ID's of this node's daughters
	 self.findArgs(): make a list of arguments from self.strID, and return this list
	"""
	def __init__(self,l):
		"""
		Sets self.id, self.label and self.mother based on the inputted node, then initialize self.strID and self.daughters as empty lists

		Input: A node in Negra-Format as a list, obtained from splitting its string on tabs
		"""
		self.id = l[0] #For leaf nodes, this is the word on the input string
		self.label = l[2]
		self.mother = l[5]
		self.strID = []
		self.daughters = []

	def allDaughtersHaveStrID(self):
		"""
		Checks if all daughters have a nonempty string ID

		Output: False if there is a daughter with an empty string ID, True otherwise
		"""
		for daughter in self.daughters:
			if daughter.strID == []:
				return False
		return True

	def getStrID(self):
		"""
		Sets self.strID based on the string ID's of this node's daughters
		"""
		if len(self.daughters) == 1: #if the node has only one daughter, its string ID is the same as its daughter's
			self.strID = self.daughters[0].strID
		else:
			strID = []
			for daughter in self.daughters:
				strID += daughter.strID #if the node has multiple daughters, its string ID is the concatenation of its daughters' string ID's
			strID.sort()  #We sort the string ID so that we can check if there are any gaps later on
			self.strID = strID

	def findArgs(self):
		"""
		Makes a list of arguments from self.strID

		Output: a list of strings, representing the arguments for the LCFRS rule based on this node
		"""
		args = []
		arg = ""
		for i in range(len(self.strID)):
			if i != 0:
				if self.strID[i]-1 != self.strID[i-1]:
				#If the previous integer in the string ID is not one lower than the current one, there is a gap. We introduce an argument boundary at every gap.
					args.append(arg)
					arg = ""
			arg += "Y_"+str(self.strID[i])
		args.append(arg)
		return args

def helptext():
	t="\n A program for inducing an LCFRS-grammar from a treebank in NEGRA format.\n\n Usage: python3 inducer.py [-h] [-o outputfile] inputfile\n\n Input:\n  -h: display this help message and exit\n  -o outputfile: file to write the induced grammar on\n  inputfile: treebank in NeGra format to induce from\n\n Output: the grammar rules of the induced grammar on the outputfile if specified, and otherwise printed in the terminal"
	return(t)

def main(args):
	opts, args = getopt.getopt(args,'ho:')
	f_out = None
	for o, a in opts:
		if o == "-h":
			return(helptext())
		if o == "-o":
			f_out = a
	grammar = Grammar(args[0])
	print("grammar made")
	if f_out == None:
		grammar.pprint()
	else:
		grammar.pwrite(f_out)


if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv[1:]))