from __future__ import division import sys from pprint import pprint from collections import defaultdict import nltk from nltk.corpus import treebank from nltk import ConditionalFreqDist, Nonterminal, FreqDist from lab5_fix import * from nltk import InsideChartParser from nltk.parse.chart import Chart,AbstractChartRule from nltk.tree import Tree,ProbabilisticTree,_child_names from nltk.parse.pchart import ProbabilisticFundamentalRule,ProbabilisticBottomUpInitRule,ProbabilisticTreeEdge,ProbabilisticLeafEdge from nltk.parse.pchart import SingleEdgeProbabilisticFundamentalRule from math import log # Renamed between 3.0 and 3.0.4 :-( if not(hasattr(Chart,'pretty_format_edge')): Chart.pretty_format_edge=Chart.pp_edge # nltk.parse.pchart is fundamentally broken, because it adds edges directly # into the chart, where the fr can see them whether or not they've come # out of the agenda or not. # The least-bad fix from outside I can come up with is implemented here: # add a boolean var called 'pending' which is true by default, only set to # false when the edge comes off the agenda, and when true causes it # to be ignored by fr # Possible bug? Even pending edges _are_ checked for when testing for # redundancy (i.e. Chart.insert is _not_ changed), but that means any # failure of best-first might cause a cheaper edge to be discarded # because an earlier, but still pending, identical-but-more expensive # edge is in the chart. nltk.chart.EdgeI.pending=True def productions_with_left_context(self,lpos=0,leaves=None): """ Generate the productions that correspond to the non-terminal nodes of the tree, with their left-context word (or None), as pairs of word and Production. For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the form P -> C1 C2 ... Cn and the word to the left of C1 >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> t.productions() [(None, S -> NP VP), (None, NP -> D N), (None, D -> 'the') ('the', N -> 'dog'), ('dog', VP -> V NP), ('dog', V -> 'chased'), ('dog', NP -> D N), ('chased', D -> 'the'), ('the', N -> 'cat')] :rtype: list(Production) """ if leaves is None: leaves=self.leaves() #if not isinstance(self._label, string_types): # raise TypeError('Productions can only be generated from trees having node labels that are strings') if lpos>0: lc=leaves[lpos-1] else: lc=None prods = [(lc,Production(Nonterminal(self._label), _child_names(self)))] for child in self: if isinstance(child, Tree): prods += child.productions_with_left_context(lpos,leaves) # could be much smarter lpos+=len(child.leaves()) else: lpos+=1 return prods Tree.productions_with_left_context=productions_with_left_context def production_distribution(psents): """ Creates a frequency distribution of lexical and non-lexical (grammatical) productions """ prod_dict = defaultdict(int) for psent in psents: for production in psent.productions(): prod_dict[production]+=1 return prod_dict def nt_counts(prod_dict): '''Create a dictionary of non-terminals and their counts''' nt_dict=defaultdict(int) for (rule,count) in prod_dict.items(): nt_dict[rule.lhs()]+=count return nt_dict def cost(prob): return 0.0 if prob==1.0 else -log(prob,2) def production_cost(production,lhs_counts,production_counts): pcount=production_counts[production] ntcount=lhs_counts[production.lhs()] return cost(float(pcount)/float(ntcount)) def get_costed_productions(psents): """ Creates costed/weighted productions from a given list of parsed sentences.""" prods_dict = production_distribution(psents) prods_nt_counts=nt_counts(prods_dict) costed_prods=[CostedProduction(p.lhs(),p.rhs(),production_cost(p, prods_nt_counts, prods_dict)) for p in prods_dict.keys()] return costed_prods class BetterPBPR(AbstractChartRule): NUM_EDGES=1 def apply(self, chart, grammar, edge): if edge.is_incomplete(): return for prod in grammar.productions(): if edge.lhs() == prod.rhs()[0]: # check for X -> X if prod.lhs()==edge.lhs() and len(prod.rhs())==1: continue new_edge = ProbabilisticTreeEdge.from_production(prod, edge.start(), prod.prob()) if chart.insert(new_edge, ()): yield new_edge class BetterSEPFR(AbstractChartRule): NUM_EDGES=1 _fundamental_rule = ProbabilisticFundamentalRule() def apply(self, chart, grammar, edge1): fr = self._fundamental_rule if edge1.is_incomplete(): # edge1 = left_edge; edge2 = right_edge for edge2 in chart.select(start=edge1.end(), is_complete=True, lhs=edge1.nextsym()): if edge2.pending: continue for new_edge in fr.apply(chart, grammar, edge1, edge2): yield new_edge else: # edge2 = left_edge; edge1 = right_edge for edge2 in chart.select(end=edge1.start(), is_complete=False, nextsym=edge1.lhs()): if edge2.pending: continue for new_edge in fr.apply(chart, grammar, edge2, edge1): yield new_edge class BetterICP(InsideChartParser): '''Implement a more user-friendly InsideChartParser, which will show intermediate results, and quit after finding a specified number of parses''' def parse(self, tokens, notify=True, max=0): '''Run a probabilistic parse of tokens. If notify is true, display each complete parse as it is found If max>0, quit after finding that many parses''' self._grammar.check_coverage(tokens) chart = Chart(list(tokens)) chart._trace=self._trace # Bad form. . . grammar = self._grammar start = grammar.start() prod_probs = {} # Chart parser rules. bu_init = ProbabilisticBottomUpInitRule() bu = BetterPBPR() # avoid infinite numbers of parses :-( fr = BetterSEPFR() # don't look at pending edges # Our queue queue = [] # Initialize the chart. for edge in bu_init.apply(chart, grammar): if self._trace > 1: print(' %-50s [%.4g]' % (chart.pretty_format_edge(edge,width=2), cost(edge.prob()))) queue.append(edge) found = 0 while len(queue) > 0 and (max<1 or found 0: print(' %-50s [%.4g]' % (chart.pretty_format_edge(edge,width=2), cost(edge.prob()))) if (edge.start()==0 and edge.end()==chart._num_leaves and edge.lhs()==start and edge.is_complete()): if len(prod_probs)==0: for prod in grammar.productions(): prod_probs[prod.lhs(), prod.rhs()] = prod.prob() if notify: print("****") for tree in chart.trees(edge, tree_class=ProbabilisticTree, complete=True): self._setprob(tree, prod_probs) print(tree, '%.4g (%.4g)'%(cost(tree.prob()),cost(edge.prob()))) #print(tree) print("****") found+=1 # Apply BU & FR to it. queue.extend(fr.apply(chart, grammar, edge)) queue.extend(bu.apply(chart, grammar, edge)) # Get a list of complete parses. parses = list(chart.parses(grammar.start(), ProbabilisticTree)) if not notify: for parse in parses: self._setprob(parse,prod_probs) # Sort by probability parses.sort(reverse=True, key=lambda tree: tree.prob()) if notify: print("%s total parses found"%found) return iter(parses) def _prune(self, queue, chart): """ Discard items in the queue if the queue is longer than the beam.""" if len(queue) > self.beam_size: split = len(queue)-self.beam_size if self._trace > 2: for edge in queue[:split]: print(' %-50s [%.4g DISCARDED]' % (chart.pretty_format_edge(edge,2), cost(edge.prob()))) del queue[:split] def beam(self,width): self.beam_size=width