'''
Authors: Henry Thompson, Bharat Ram Ambati, Ida Szubert
Date: 2014-10-01, 2017-10-09

Copyright: This work is licensed under a Creative Commons
Attribution-NonCommercial 4.0 International License
(http://creativecommons.org/licenses/by-nc/4.0/): You may re-use,
redistribute, or modify this work for non-commercial purposes provided
you retain attribution to any previous author(s).
'''
from __future__ import division
import sys
from pprint import pprint
from math import log
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np  # numpy provides useful maths and vector operations
from numpy.random import random_sample
import nltk
from nltk import FreqDist, ConditionalFreqDist
from nltk.tag import *
from nltk.corpus import dependency_treebank
from nltk.tag.hmm import HiddenMarkovModelTagger


# This function isn't actually used in the lab, but is included to show
# a simpler example of a bar chart than plot_distributions below.
def plot_histogram(lcPairs):
    '''Draw a bar plot showing the count for each label.

    lcPairs is either a dictionary mapping labels to counts, or an
    iterable of (label, count) pairs.  Counts need to be numbers.

    The current figure is cleared first.  If lcPairs is empty there is
    nothing to draw, so the function returns without showing a plot.
    '''
    plt.clf()
    if isinstance(lcPairs, dict):
        lcPairs = lcPairs.items()
    # Materialise the pairs: dict views and one-shot iterables then
    # support len() and can safely be traversed by zip() below.
    pairs = list(lcPairs)
    if not pairs:
        # zip(*[]) cannot be unpacked into (labels, counts) and max()
        # of an empty sequence raises, so stop here.
        return
    # arange() is like range() but returns a numpy array instead of a list
    x_pos = np.arange(len(pairs))
    (labels, counts) = zip(*pairs)
    # First arg (x_pos) is the position of the left-hand side of each bar,
    # second arg (counts) is the height of the bar.  align='edge' is given
    # explicitly because the default alignment in recent matplotlib
    # releases is 'center', which would put the tick labels below on the
    # right-hand edge of each bar instead of its middle.
    plt.bar(x_pos, counts, width=1, align='edge')
    # Bar width is 1, so put labels at left side + .5 (middle of bar),
    # rotated slightly
    plt.xticks(x_pos + .5, labels, rotation=50)
    plt.ylim([0, 1.1 * max(counts)])
    plt.show()


def sent_length_distribution(sents):
    '''Return a frequency distribution of sentence lengths.

    sents is a sequence of tagged sentences extracted using the nltk
    libraries.  The result is a defaultdict(int) intended to map each
    sentence length to the number of sentences of that length.

    NOTE: this is a student exercise.  As it stands the length of each
    sentence is computed but never counted, so the returned
    distribution is always empty.
    '''
    fd = defaultdict(int)
    for s in sents:
        n = len(s)
        ## students: fill in the rest
    return fd


def tag_distribution(sents):
    '''Return a frequency distribution of pos tags.

    sents is a sequence of tagged sentences extracted using the nltk
    libraries; each sentence is a sequence of (word, tag) pairs.  The
    result is a defaultdict(int) intended to map each tag to its count.

    NOTE: this is a student exercise.  As it stands the loop body does
    nothing, so the returned distribution is always empty.
    '''
    fd = defaultdict(int)
    for s in sents:
        for (w, t) in s:
            pass  ## students: replace this
    return fd


def word_tag_distribution(sents):
    '''Return the conditional frequency distribution of words and pos tags.

    sents is a sequence of tagged sentences extracted using the nltk
    libraries; each sentence is a sequence of (word, tag) pairs.  The
    result is a plain dictionary mapping each word to a dictionary that
    is intended to map each tag seen with that word to its count.

    NOTE: this is a student exercise.  As it stands every word seen is
    registered in the result, but its tag dictionary is left empty.
    '''
    # Instead of using defaultdict, do things a step at a time:
    cfd = {}
    # For each word,tag tuple in each sentence,
    # create an fd if necessary, and update the tag count
    for sent in sents:
        for (w, t) in sent:
            # either get the value,
            # or give it an empty dictionary as value
            fd = cfd.setdefault(w, {})
            if t in fd:
                # a known key
                pass  # students: replace this
            else:
                # not previously seen
                pass  # students: replace this
    return cfd


def unigram_tagger(cfd, bpos, sent):
    '''Tag each word of a sentence using a simple unigram pos tagger.

    cfd is the conditional frequency distribution of words and their
    tags, bpos is the default pos tag and sent is the sentence as a
    single string.  Trailing whitespace is stripped from sent and it is
    split on whitespace to obtain the words.

    If a word is seen (present in cfd), it should be assigned the most
    frequent tag for that word.  For unseen words (not present in cfd),
    it should be assigned the default pos tag bpos (for example common
    noun "NN").

    Returns a list with one entry per word, each being whatever ut1
    returns for that word.
    '''
    words = sent.rstrip().split()
    ## students need to fill in correct function
    return [ut1(w, cfd, bpos) for w in words]


def ut1(word, cfd, bpos):
    '''Helper function for unigram_tagger.

    Look up the most common tag for a single word in cfd, using bpos
    if it's not in there.

    NOTE: this is a student exercise.  As it stands the function does
    nothing and therefore returns None.
    '''
    pass  # students: replace this with correct function definition


# Extracting tagged sentences using NLTK libraries
tsents = dependency_treebank.tagged_sents()

# Passing -q as the first command-line argument suppresses the summary.
if (len(sys.argv) < 2 or sys.argv[1] != '-q'):
    # 'loaded' counts how many times this file has been executed in the
    # current namespace; the NameError branch is taken on the first run.
    try:
        loaded += 1
    except NameError:
        loaded = 1
        # NOTE(review): the summary is assumed to belong to the
        # first-load branch, so that re-running the file in the same
        # session does not repeat it -- confirm against the lab handout.
        print("\nFirst tagged sentence:\n", tsents[0])
        print("\nFirst tuple in the sentence:", tsents[0][0])
        print("\nWord in the first tuple:", tsents[0][0][0])
        print("\nTag in the first tuple:", tsents[0][0][1])
        print("\nTotal Number of sentences:", len(tsents))
        print("\nAverage Sentence Length:",
              (sum([len(sent) for sent in tsents])) / len(tsents))