#This is the kind of stuff I've been doing, rather than working on Factor import sys def parseGrammar(file): productions = {} for line in file: line = line.strip().split("#")[0] #strip off comments if line: parts = line.split(":") if len(parts) != 2: print "Grammar is malformed; line missing colon" sys.exit(1) lhs = parts[0].strip() rhs = parts[1].split() if lhs not in productions: productions[lhs] = [] productions[lhs].append(rhs) return productions def constructDictionary(grammar): newproductions = {} for key, value in grammar.items(): newproductions[key] = set([x[0] for x in value]) return newproductions class Branch: def __init__(self, payload, children): self.payload = payload self.children = children def write(self, depth): print " "*depth + self.payload for child in self.children: child.write(depth+1) def withNextChild(self, child): return Branch(self.payload, self.children+[child]) def __repr__(self): return "Branch("+repr(self.payload)+", "+repr(self.children)+")" class Leaf: def __init__(self, category, lexeme): self.category = category self.lexeme = lexeme def write(self, depth): print " "*depth + self.category + " - " + self.lexeme def __repr__(self): return "Leaf("+repr(self.category)+", "+repr(self.lexeme)+")" def printTree(tree): tree.write(0) class Row: def __init__(self, nonterminal, rhs, dot, startIndex, currentIndex, tree): self.nonterminal = nonterminal self.rhs = tuple(rhs) self.dot = dot self.startIndex = startIndex self.currentIndex = currentIndex self.tree = tree def scan(self, lexeme): return Row(self.nonterminal, self.rhs, self.dot+1, self.startIndex, self.currentIndex+1, self.tree.withNextChild(Leaf(self.current(), lexeme))) def predict(self, grammar): i = self.currentIndex nonterminal = self.current() possibilities = grammar[nonterminal] return [Row(nonterminal, possibility, 0, i, i, Branch(nonterminal, [])) for possibility in possibilities] def complete(self, completedRow): return Row(self.nonterminal, self.rhs, self.dot+1, self.startIndex, completedRow.currentIndex, self.tree.withNextChild(completedRow.tree)) def isDone(self): return self.dot == len(self.rhs) def current(self): return self.rhs[self.dot] def hasNext(self, nonterminal): if self.isDone(): return False else: return self.current() == nonterminal #Rows are defined by everything but their tree, so equality and hashcode must be rewritten def __eq__(self, other): return isinstance(other, Row) and self.nonterminal == other.nonterminal \ and self.rhs == other.rhs and self.dot == other.dot \ and self.startIndex == other.startIndex and self.currentIndex == other.currentIndex def __hash__(self): return hash(self.nonterminal) ^ hash(self.rhs) ^ self.dot ^ self.startIndex << 8 ^ self.currentIndex << 16 def __repr__(self): return "Row("+repr(self.nonterminal)+", "+repr(self.rhs)+", "+repr(self.dot)+", "+repr(self.startIndex)+", "+repr(self.currentIndex)+","+repr(self.tree)+")" def parse(grammar, lexicon, words): start = Row('__gamma__', ['S'], 0, 0, 0, Branch('__gamma__', [])) rows = [(set(), set([start]))]+[(set(), set()) for i in range(len(words))] def enqueue(row): i = row.currentIndex if row not in rows[i][0] and row not in rows[i][1]: rows[i][1].add(row) for i in range(len(words)+1): while len(rows[i][1]) != 0: row = rows[i][1].pop() rows[i][0].add(row) if row.isDone(): #Completer for candidate in rows[row.startIndex][0] | rows[row.startIndex][1]: if candidate.hasNext(row.nonterminal): enqueue(candidate.complete(row)) elif row.current() in grammar: #Predictor for candidate in row.predict(grammar): enqueue(candidate) elif row.current() in lexicon: #Scanner if i < len(words) and words[i] in lexicon[row.current()]: candidate = row.scan(words[i]) enqueue(candidate) else: raise Exception("The grammar is ill-formed") for candidate in rows[len(words)][0]: if candidate.nonterminal == '__gamma__' and candidate.isDone(): return candidate.tree.children[0] return None def main(grammarFile, lexiconFile, string): grammar = parseGrammar(grammarFile) lexicon = constructDictionary(parseGrammar(lexiconFile)) tree = parse(grammar, lexicon, string) if tree == None: print "The sentence was not recognized" else: printTree(tree) if __name__ == '__main__': if len(sys.argv) != 4: print "Usage: python %s grammar-file lexicon-file string-to-parse" % sys.argv[0] sys.exit(1) main(open(sys.argv[1]), open(sys.argv[2]), sys.argv[3].split())