'''
Author: Sharon Goldwater
Date: 2014-09-01, updated 2017-09-15 for Python 3
Copyright: This work is licensed under a Creative Commons
Attribution-NonCommercial 4.0 International License
(http://creativecommons.org/licenses/by-nc/4.0/): You may re-use,
redistribute, or modify this work for non-commercial purposes provided
you retain attribution to any previous author(s).

This file defines the functions used in Lab 2, which can be used to
count words in files, make a Zipf plot or histogram, and compute and
plot MLU.
'''

import sys

#This line imports a special data type called defaultdict, which you
#can use like a dictionary except that it assigns a default value to
#any item that has not yet been accessed/assigned. To use it, you need
#to provide the data type that will be stored in the defaultdict, so
#that it knows what default value to use. We will store integers,
#which have default value 0.
from collections import defaultdict

#This line allows us to use various plotting functions (see code)
import matplotlib.pyplot as plt


def get_word_counts(fnames):
    '''
    Count the word tokens spoken by the mother in a list of files.

    fnames is a list of filenames (strings). The return value is a
    defaultdict(int) mapping each unique word to the number of times it
    occurs on lines starting with '*MOT' across all of the files.

    The files are assumed to be formatted as in the Providence corpus
    of CHILDES: on each '*MOT' line the first whitespace-separated token
    is the speaker tag and the last two tokens are the final
    punctuation and the timestamp, so these three are not counted.
    '''
    word_counts = defaultdict(int)  #defaultdict of integers (see note above)
    for fname in fnames:
        print('Opening ', fname)
        with open(fname, 'r') as infile:
            for line in infile:
                if not line.startswith('*MOT'):
                    continue  #only look at lines spoken by the mother
                #split() with no arguments splits on any whitespace and
                #ignores leading/trailing whitespace, so no strip() needed
                tokens = line.split()
                #drop *MOT, final punctuation, and timestamp
                for tok in tokens[1:-2]:
                    #We can also skip over other punctuation. This is a
                    #very basic version; a more sophisticated one might
                    #use regular expressions.
                    if tok != ',':
                        #word_counts[tok] defaults to 0 if not previously
                        #accessed because we used a defaultdict
                        word_counts[tok] += 1
    return word_counts


def zipf_plot(word_counts, maxRank=0):
    '''
    Make two scatter plots of rank vs. frequency (aka Zipf plots).

    word_counts is a dictionary mapping words to their counts. One plot
    has linear axes, the other has log-log axes.

    The second (optional) argument can be used to plot only the top
    ranked words (up to maxRank); in that case the x-axis of the linear
    plot is labelled with the words themselves. If maxRank is 0 (the
    default) or negative, all words are plotted. If maxRank is larger
    than the number of distinct words, all words are plotted and
    labelled.
    '''
    #We will use the 'plot' function of matplotlib to create a scatter
    #plot. 'plot' requires two lists, X and Y, as its two required
    #arguments, corresponding to the x positions and y positions of
    #the data points, and will plot points at (X[0], Y[0]),
    #(X[1],Y[1]), etc. Here we want to plot rank versus frequency, so
    #X should be the ranks, and Y should be the frequencies at each
    #rank. The third (optional) argument to 'plot' specifies the color
    #and style of the data points (or lines) to use. Here we use black
    #('k') dots ('.')

    #Note: All the functions preceded by 'plt' come from the
    #matplotlib library. There are actually two interfaces for this
    #library - the one we use here looks a lot like Matlab, which some
    #people may already be familiar with. There is also a more
    #object-oriented interface which you can look up if interested but
    #we won't use it here.

    #Rank the words once, most frequent first, and read the counts off
    #in the same order so that words and counts always line up.
    ranked_words = sorted(word_counts, key=word_counts.get, reverse=True)
    sorted_counts = [word_counts[wd] for wd in ranked_words]

    #The ranks are just the integers from 1 to the number of word types.
    x_pos = list(range(1, len(ranked_words) + 1))

    show_words = maxRank > 0
    if show_words:  #only plot words up to rank maxRank
        x_pos = x_pos[:maxRank]
        sorted_counts = sorted_counts[:maxRank]
        ranked_words = ranked_words[:maxRank]

    #the following commands create the plots
    plt.clf()  # clear previous plots (if any)

    plt.subplot(1, 2, 1)  #specify the first plot in a 1x2 set of plots
    plt.plot(x_pos, sorted_counts, 'k.')  # create the scatter plot
    if show_words:
        #Label each point with its word. The ticks must sit at the same
        #x positions as the points (ranks start at 1, not 0), and there
        #must be exactly one label per tick.
        plt.xticks(x_pos, ranked_words, rotation=50)
        plt.xlabel('Word')
    else:
        plt.xlabel('Rank')
    plt.ylabel('Frequency')
    plt.title('Words from *MOT utterances, linear axes')

    plt.subplot(1, 2, 2)  #specify the second plot in a 1x2 set of plots
    plt.xscale('log')  #set x axis to log scale. Must do *before* creating plot
    plt.yscale('log')  #set y axis to log scale.
    plt.plot(x_pos, sorted_counts, 'k.')  # create the scatter plot
    plt.xlabel('Rank')
    plt.ylabel('Frequency')
    plt.title('Words from *MOT utterances, log axes')

    plt.show()  #display the set of plots


def get_mlus(fnames):
    '''
    Compute the mean length of utterance (MLU) for each file.

    fnames is a list of filenames. The return value is a list of MLU
    values, one per file, where each MLU is the average number of
    tokens per utterance on the lines starting with '*CHI' (the child's
    utterances) in that file.

    NOTE: the files are processed in *sorted* filename order, so the
    returned list follows sorted(fnames), not the order of fnames
    itself. Any labels paired with the result must use the same order.

    It assumes the file format used in the Providence corpus: three
    tokens on each '*CHI' line (speaker tag, final punctuation,
    timestamp) are not words and are excluded from the length.

    An AssertionError is raised if a '*CHI' line has fewer than three
    tokens or if a file contains no '*CHI' lines at all.
    '''
    mlus = []
    for fname in sorted(fnames):
        print('Opening ', fname)
        nutts = 0  #number of child utterances seen in this file
        ntoks = 0  #number of word tokens in those utterances
        with open(fname, 'r') as infile:
            for line in infile:
                if line.startswith('*CHI'):
                    tokens = line.split()
                    # code will exit if this check fails
                    assert len(tokens) >= 3, (
                        'malformed *CHI line in ' + fname + ': ' + line.strip())
                    ntoks += len(tokens) - 3
                    nutts += 1
        # code will exit if this check fails
        assert nutts > 0, 'no *CHI utterances found in ' + fname
        mlus.append(ntoks / nutts)
    return mlus


def plot_mlus(mlus, labels):
    '''
    Make a line plot of MLU values.

    mlus is a list of MLU values and labels is a list of labels, one
    per value. Each value is labelled on the x-axis with the
    corresponding label. The y-axis is fixed to the range 0 to 6.
    '''
    plt.clf()
    ### students must fill in the rest
    x_pos = range(len(mlus))
    plt.plot(x_pos, mlus)
    plt.xticks(x_pos, labels, rotation=90)
    plt.ylim(0, 6)
    plt.ylabel('Mean length of utterance')
    ### to here.
    plt.show()


#### main body of code ####

#Only run the lab when this file is executed as a script, so that the
#functions above can be imported without side effects. The statements
#are kept at module level (rather than inside a function) so that
#fnames, word_counts, mlus, etc. remain available as global variables
#after an interactive run.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('You must provide at least one filename argument')
        sys.exit(1)  #exit program with error code

    fnames = sys.argv[1:]  #get input arguments from shell command line

    ###for first part of lab
    word_counts = get_word_counts(fnames)
    zipf_plot(word_counts)

    ###for second part of lab
    #get_mlus processes the files in sorted order, so build the labels
    #from the sorted filenames too; otherwise labels and values can be
    #mismatched when the arguments are not already sorted.
    sorted_fnames = sorted(fnames)
    #presumably the five characters before a four-character extension
    #(e.g. '.cha') identify the session -- verify against the corpus
    short_fnames = [fname[-9:-4] for fname in sorted_fnames]

    ### fill in below to generate MLU plots:
    mlus = get_mlus(sorted_fnames)
    plot_mlus(mlus, short_fnames)