Paste: Earley parser
Author: | littledan |
Mode: | python |
Date: | Thu, 14 Oct 2010 22:43:20 |
Plain Text |
import sys
def parseGrammar(file):
productions = {}
for line in file:
line = line.strip().split("#")[0]
if line:
parts = line.split(":")
if len(parts) != 2:
print "Grammar is malformed; line missing colon"
sys.exit(1)
lhs = parts[0].strip()
rhs = parts[1].split()
if lhs not in productions:
productions[lhs] = []
productions[lhs].append(rhs)
return productions
def constructDictionary(grammar):
newproductions = {}
for key, value in grammar.items():
newproductions[key] = set([x[0] for x in value])
return newproductions
class Branch:
def __init__(self, payload, children):
self.payload = payload
self.children = children
def write(self, depth):
print " "*depth + self.payload
for child in self.children:
child.write(depth+1)
def withNextChild(self, child):
return Branch(self.payload, self.children+[child])
def __repr__(self):
return "Branch("+repr(self.payload)+", "+repr(self.children)+")"
class Leaf:
def __init__(self, category, lexeme):
self.category = category
self.lexeme = lexeme
def write(self, depth):
print " "*depth + self.category + " - " + self.lexeme
def __repr__(self):
return "Leaf("+repr(self.category)+", "+repr(self.lexeme)+")"
def printTree(tree):
tree.write(0)
class Row:
def __init__(self, nonterminal, rhs, dot, startIndex, currentIndex, tree):
self.nonterminal = nonterminal
self.rhs = tuple(rhs)
self.dot = dot
self.startIndex = startIndex
self.currentIndex = currentIndex
self.tree = tree
def scan(self, lexeme):
return Row(self.nonterminal, self.rhs, self.dot+1, self.startIndex, self.currentIndex+1, self.tree.withNextChild(Leaf(self.current(), lexeme)))
def predict(self, grammar):
i = self.currentIndex
nonterminal = self.current()
possibilities = grammar[nonterminal]
return [Row(nonterminal, possibility, 0, i, i, Branch(nonterminal, [])) for possibility in possibilities]
def complete(self, completedRow):
return Row(self.nonterminal, self.rhs, self.dot+1, self.startIndex, completedRow.currentIndex, self.tree.withNextChild(completedRow.tree))
def isDone(self):
return self.dot == len(self.rhs)
def current(self):
return self.rhs[self.dot]
def hasNext(self, nonterminal):
if self.isDone(): return False
else: return self.current() == nonterminal
def __eq__(self, other):
return isinstance(other, Row) and self.nonterminal == other.nonterminal \
and self.rhs == other.rhs and self.dot == other.dot \
and self.startIndex == other.startIndex and self.currentIndex == other.currentIndex
def __hash__(self):
return hash(self.nonterminal) ^ hash(self.rhs) ^ self.dot ^ self.startIndex << 8 ^ self.currentIndex << 16
def __repr__(self):
return "Row("+repr(self.nonterminal)+", "+repr(self.rhs)+", "+repr(self.dot)+", "+repr(self.startIndex)+", "+repr(self.currentIndex)+","+repr(self.tree)+")"
def parse(grammar, lexicon, words):
start = Row('__gamma__', ['S'], 0, 0, 0, Branch('__gamma__', []))
rows = [(set(), set([start]))]+[(set(), set()) for i in range(len(words))]
def enqueue(row):
i = row.currentIndex
if row not in rows[i][0] and row not in rows[i][1]:
rows[i][1].add(row)
for i in range(len(words)+1):
while len(rows[i][1]) != 0:
row = rows[i][1].pop()
rows[i][0].add(row)
if row.isDone():
for candidate in rows[row.startIndex][0] | rows[row.startIndex][1]:
if candidate.hasNext(row.nonterminal):
enqueue(candidate.complete(row))
elif row.current() in grammar:
for candidate in row.predict(grammar):
enqueue(candidate)
elif row.current() in lexicon:
if i < len(words) and words[i] in lexicon[row.current()]:
candidate = row.scan(words[i])
enqueue(candidate)
else:
raise Exception("The grammar is ill-formed")
for candidate in rows[len(words)][0]:
if candidate.nonterminal == '__gamma__' and candidate.isDone():
return candidate.tree.children[0]
return None
def main(grammarFile, lexiconFile, string):
grammar = parseGrammar(grammarFile)
lexicon = constructDictionary(parseGrammar(lexiconFile))
tree = parse(grammar, lexicon, string)
if tree == None:
print "The sentence was not recognized"
else:
printTree(tree)
if __name__ == '__main__':
if len(sys.argv) != 4:
print "Usage: python %s grammar-file lexicon-file string-to-parse" % sys.argv[0]
sys.exit(1)
main(open(sys.argv[1]), open(sys.argv[2]), sys.argv[3].split())
New Annotation