Paste: Earley parser

Author: littledan
Mode: python
Date: Thu, 14 Oct 2010 22:43:20
Plain Text |
#This is the kind of stuff I've been doing, rather than working on Factor

import sys

def parseGrammar(file):
    productions = {}
    for line in file:
        line = line.strip().split("#")[0] #strip off comments
        if line:
            parts = line.split(":")
            if len(parts) != 2:
                print "Grammar is malformed; line missing colon"
                sys.exit(1)
            lhs = parts[0].strip()
            rhs = parts[1].split()
            if lhs not in productions:
                productions[lhs] = []
            productions[lhs].append(rhs)
    return productions

def constructDictionary(grammar):
    newproductions = {}
    for key, value in grammar.items():
	newproductions[key] = set([x[0] for x in value])
    return newproductions

class Branch:
    def __init__(self, payload, children):
        self.payload = payload
        self.children = children
    def write(self, depth):
        print "  "*depth + self.payload
        for child in self.children:
            child.write(depth+1)
    def withNextChild(self, child):
        return Branch(self.payload, self.children+[child])
    def __repr__(self):
        return "Branch("+repr(self.payload)+", "+repr(self.children)+")"

class Leaf:
    def __init__(self, category, lexeme):
        self.category = category
        self.lexeme = lexeme
    def write(self, depth):
        print "  "*depth + self.category + " - " + self.lexeme
    def __repr__(self):
        return "Leaf("+repr(self.category)+", "+repr(self.lexeme)+")"

def printTree(tree):
    tree.write(0)

class Row:
    def __init__(self, nonterminal, rhs, dot, startIndex, currentIndex, tree):
        self.nonterminal = nonterminal
        self.rhs = tuple(rhs)
        self.dot = dot
        self.startIndex = startIndex
        self.currentIndex = currentIndex
        self.tree = tree
    def scan(self, lexeme):
        return Row(self.nonterminal, self.rhs, self.dot+1, self.startIndex, self.currentIndex+1, self.tree.withNextChild(Leaf(self.current(), lexeme)))
    def predict(self, grammar):
        i = self.currentIndex
        nonterminal = self.current()
        possibilities = grammar[nonterminal]
        return [Row(nonterminal, possibility, 0, i, i, Branch(nonterminal, [])) for possibility in possibilities]
    def complete(self, completedRow):
        return Row(self.nonterminal, self.rhs, self.dot+1, self.startIndex, completedRow.currentIndex, self.tree.withNextChild(completedRow.tree))
    def isDone(self):
        return self.dot == len(self.rhs)
    def current(self):
        return self.rhs[self.dot]
    def hasNext(self, nonterminal):
        if self.isDone(): return False
        else: return self.current() == nonterminal
    #Rows are defined by everything but their tree, so equality and hashcode must be rewritten
    def __eq__(self, other):
        return isinstance(other, Row) and self.nonterminal == other.nonterminal \
            and self.rhs == other.rhs and self.dot == other.dot \
            and self.startIndex == other.startIndex and self.currentIndex == other.currentIndex
    def __hash__(self):
        return hash(self.nonterminal) ^ hash(self.rhs) ^ self.dot ^ self.startIndex << 8 ^ self.currentIndex << 16
    def __repr__(self):
        return "Row("+repr(self.nonterminal)+", "+repr(self.rhs)+", "+repr(self.dot)+", "+repr(self.startIndex)+", "+repr(self.currentIndex)+","+repr(self.tree)+")"

def parse(grammar, lexicon, words):
    start = Row('__gamma__', ['S'], 0, 0, 0, Branch('__gamma__', []))
    rows = [(set(), set([start]))]+[(set(), set()) for i in range(len(words))]
    def enqueue(row):
        i = row.currentIndex
        if row not in rows[i][0] and row not in rows[i][1]:
            rows[i][1].add(row)
    for i in range(len(words)+1):
        while len(rows[i][1]) != 0:
            row = rows[i][1].pop()
            rows[i][0].add(row)
            if row.isDone():
                #Completer
                for candidate in rows[row.startIndex][0] | rows[row.startIndex][1]:
                    if candidate.hasNext(row.nonterminal):
                        enqueue(candidate.complete(row))
            elif row.current() in grammar:
                #Predictor
                for candidate in row.predict(grammar):
                    enqueue(candidate)
            elif row.current() in lexicon:
                #Scanner
                if i < len(words) and words[i] in lexicon[row.current()]:
                    candidate = row.scan(words[i])
                    enqueue(candidate)
            else:
                raise Exception("The grammar is ill-formed")
    for candidate in rows[len(words)][0]:
        if candidate.nonterminal == '__gamma__' and candidate.isDone():
            return candidate.tree.children[0]
    return None
    

def main(grammarFile, lexiconFile, string):
    grammar = parseGrammar(grammarFile)
    lexicon = constructDictionary(parseGrammar(lexiconFile))
    tree = parse(grammar, lexicon, string)
    if tree == None:
        print "The sentence was not recognized"
    else:
    	printTree(tree)

if __name__ == '__main__':
    if len(sys.argv) != 4:
        print "Usage: python %s grammar-file lexicon-file string-to-parse" % sys.argv[0]
        sys.exit(1)
    main(open(sys.argv[1]), open(sys.argv[2]), sys.argv[3].split())

New Annotation

Summary:
Author:
Mode:
Body: