import operator
from math import log
from pprint import pprint

import nltk
from nltk import bigrams
from nltk.corpus import gutenberg
from nltk.corpus import stopwords
from nltk.probability import FreqDist, MLEProbDist


class FasterMLEProbDist(MLEProbDist):
    """Speed up prob lookup for large sample sizes."""

    def __init__(self, freqdist):
        """Wrap *freqdist* and cache its total sample count as a float."""
        self._N = freqdist.N()
        self._empty = self._N == 0
        if not self._empty:
            # Cached once so prob() does a single float division per lookup.
            self._pq = float(self._N)
        MLEProbDist.__init__(self, freqdist)

    def prob(self, sample):
        """Return count(sample) / N using the cached total; 0 if empty."""
        if self._empty:
            return 0
        return float(self._freqdist[sample]) / self._pq


sentences = gutenberg.sents('melville-moby_dick.txt')
# NOTE: this rebinds the imported corpus reader name to a plain word list.
stopwords = stopwords.words('english')
# Set copy of the list above so Filter2 does not scan the list per token.
_STOPWORD_SET = frozenset(stopwords)


def _show(*parts):
    """Print *parts* joined by single spaces.

    Passing exactly one parenthesised string to ``print`` behaves the same
    whether ``print`` is a statement or a function, so no ``__future__``
    import is required.
    """
    print(" ".join(str(part) for part in parts))


def Filter1(word):
    """Accept tokens that consist only of alphabetic characters."""
    return word.isalpha()


def Filter2(word):
    """Accept alphabetic tokens whose lowercase form is not a stopword."""
    return word.isalpha() and word.lower() not in _STOPWORD_SET


def BuildData(sentences, Filter):
    """Build the bigram and unigram lists from tokenised sentences.

    Args:
        sentences: iterable of sentences, each a sequence of token strings.
        Filter: predicate applied to each token; only accepted tokens are kept.

    Returns:
        ``(bigrams_list, unigrams_list)``. All tokens are lower-cased. A
        bigram is kept only when both of its tokens pass *Filter*; bigrams
        are taken from tokens adjacent in the original sentence and never
        span a sentence boundary.
    """
    unigrams_list = [
        word.lower()
        for sentence in sentences
        for word in sentence
        if Filter(word)
    ]
    bigrams_list = [
        (first.lower(), second.lower())
        for sentence in sentences
        for first, second in bigrams(sentence)
        if Filter(first) and Filter(second)
    ]
    return bigrams_list, unigrams_list


def ex1(bigrams, unigrams):
    """Estimate MLE probability distributions over bigrams and unigrams.

    Args:
        bigrams: list of bigram tuples (shadows the nltk helper locally).
        unigrams: list of unigram strings.

    Returns:
        ``(bigramProbDist, unigramProbist)``, both ``FasterMLEProbDist``.
    """
    bigramFreqDist = FreqDist(bigrams)
    unigramFreqDist = FreqDist(unigrams)

    bigramProbDist = FasterMLEProbDist(bigramFreqDist)
    unigramProbist = FasterMLEProbDist(unigramFreqDist)

    return bigramProbDist, unigramProbist


def test1():
    """Run exercise 1 with both filters and show the 30 most common bigrams.

    Returns:
        The Filter1 distributions ``(bigramProbDist1, unigramProbist1)``,
        which feed exercise 2.
    """
    bigrams, unigrams = BuildData(sentences, Filter1)

    bigramProbDist1, unigramProbist1 = ex1(bigrams, unigrams)
    _show("type: ", type(bigramProbDist1))
    _show("type: ", type(unigramProbist1))

    MLESorted = bigramProbDist1.freqdist().most_common(30)
    # Label first, then the list: pprint() returns None, so it must not be
    # passed as a print argument.
    _show("Using filter 1:")
    pprint(MLESorted)
    _show("type: \n", type(MLESorted))

    bigrams, unigrams = BuildData(sentences, Filter2)
    bigramProbDist, unigramProbist = ex1(bigrams, unigrams)
    MLESorted = bigramProbDist.freqdist().most_common(30)
    _show("Using filter 2:")
    pprint(MLESorted)
    _show("\n")

    return bigramProbDist1, unigramProbist1


# TEST EXERCISE 1 - return values will be used for exercise 2
#bigramProbDist, unigramProbDist = test1()


def ComputePMI(bpd, upd):
    """Compute pointwise mutual information for every bigram in *bpd*.

    PMI(x, y) = log2( P(x, y) / (P(x) * P(y)) ).

    Args:
        bpd: bigram probability distribution (``FasterMLEProbDist``).
        upd: unigram probability distribution (``FasterMLEProbDist``).

    Returns:
        ``(PMIs, PMIsorted)``: a dict mapping bigram -> PMI, and a list of
        ``(bigram, PMI)`` pairs sorted by PMI, highest first.
    """
    PMIs = []
    for sample in bpd.samples():
        joint = bpd.prob(sample)
        independent = upd.prob(sample[0]) * upd.prob(sample[1])
        # PMI is undefined when either probability is zero (e.g. the two
        # distributions were built with different filters); skip those.
        if joint <= 0 or independent <= 0:
            continue
        PMIs.append((sample, log(joint / independent, 2)))

    # list of (bigram, PMI) sorted according to the PMI score
    PMIsorted = sorted(PMIs, key=operator.itemgetter(1), reverse=True)
    return dict(PMIs), PMIsorted


def test2(bpd, upd):
    """Run exercise 2: print sample PMIs and the top-ranked bigrams.

    Returns:
        The PMI-sorted list of ``(bigram, PMI)`` pairs, used by exercise 3.
    """
    _show("type: ", type(bpd))
    _show("type: ", type(upd))

    PMIs, PMIsorted = ComputePMI(bpd, upd)
    _show("type: ", type(PMIs))
    _show("type: ", type(PMIsorted))

    _show("sperm whale %0.2f" % PMIs[("sperm", "whale")])
    _show("of the %0.2f" % PMIs[("of", "the")])
    # Comment on why this is not close to 0 as expected: not enough data.
    _show("old man %0.2f" % PMIs[("old", "man")])
    _show("one side %0.2f" % PMIs[("one", "side")])
    _show("\n")

    bcount = bpd.freqdist()
    for pair in PMIsorted[:10]:
        _show("%s\t%0.2f\t%d" % (pair[0], pair[1], bcount[pair[0]]))

    # First ten pairs, in PMI order, whose bigram occurs more than 30 times.
    shown = 0
    for pair in PMIsorted:
        if shown == 10:
            break
        if bcount[pair[0]] > 30:
            _show("%s\t%0.2f\t%d" % (pair[0], pair[1], bcount[pair[0]]))
            shown += 1
    _show()

    return PMIsorted


# TEST EXERCISE 2 - return values will be used for exercise 3
#PMIsorted = test2(bigramProbDist, unigramProbDist)


def ex3(PMIsorted, bpd, min_count=30):
    """Drop low-frequency bigrams from a PMI ranking.

    Args:
        PMIsorted: list of ``(bigram, PMI)`` pairs, already sorted.
        bpd: bigram probability distribution supplying the bigram counts.
        min_count: a bigram is kept only if its count is strictly greater
            than this value (default 30).

    Returns:
        The pairs of *PMIsorted*, in their original order, whose bigram
        count exceeds *min_count*.
    """
    bcount = bpd.freqdist()
    high_freq_PMIsorted = [
        (sample, pmi) for sample, pmi in PMIsorted if bcount[sample] > min_count
    ]
    return high_freq_PMIsorted


def test3(PMIsorted, bpd):
    """Run exercise 3: print the top and bottom 20 frequent bigrams by PMI."""
    high_freq = ex3(PMIsorted, bpd)
    bcount = bpd.freqdist()

    _show("\nTop 20 by PMI where pair count>30")
    _show("%s\t%s\t%s" % ('PMI', 'n', 'pair'))
    for pair in high_freq[:20]:
        _show("%0.2f\t%d\t%s" % (pair[1], bcount[pair[0]], pair[0]))

    _show("\nBottom 20 by PMI where pair count>30")
    # Same column order as the header and the top-20 table above.
    for pair in high_freq[-20:]:
        _show("%0.2f\t%d\t%s" % (pair[1], bcount[pair[0]], pair[0]))


# TEST EXERCISE 3
#test3(PMIsorted,bigramProbDist)