"""
Authors: Luke Shrimpton, Sharon Goldwater, Ida Szubert
Date: 2014-11-01, 2017-11-05
Copyright: This work is licensed under a Creative Commons
Attribution-NonCommercial 4.0 International License
(http://creativecommons.org/licenses/by-nc/4.0/): You may re-use,
redistribute, or modify this work for non-commercial purposes provided
you retain attribution to any previous author(s).
"""
from __future__ import division

from math import log

from pylab import mean


def PMI(c_xy, c_x, c_y, N):
    """Return the pointwise mutual information PMI(x, y), in bits.

    This version substitutes counts directly into the equation for PMI,
    cancelling N.

    Args:
        c_xy: the number of times x co-occurs with y.
        c_x: the number of times x occurs.
        c_y: the number of times y occurs.
        N: the number of observations.

    Raises:
        ValueError: if c_xy is 0 (logarithm of zero).
        ZeroDivisionError: if c_x or c_y is 0.
    """
    return log(N * c_xy / (c_x * c_y), 2)


def PMI2(c_xy, c_x, c_y, N):
    """Return PMI(x, y) in bits, computed via explicit probabilities.

    Many of you did this version instead, which is less efficient
    computationally but probably easier to understand.

    Args:
        c_xy: the number of times x co-occurs with y.
        c_x: the number of times x occurs.
        c_y: the number of times y occurs.
        N: the number of observations.

    Raises:
        ValueError: if c_xy is 0 (logarithm of zero).
        ZeroDivisionError: if N, c_x or c_y is 0.
    """
    p_x = c_x / N
    p_y = c_y / N
    p_xy = c_xy / N
    return log(p_xy / (p_x * p_y), 2)


def _pmis_with_target(target_wid, words, wid_of, occ_counts, cooc_counts, n_obs):
    """Return PMI(target, w) for every w in *words* that co-occurs with the target.

    Words that never co-occur with the target are skipped, so the result may
    be shorter than *words* (possibly empty).
    """
    target_count = occ_counts[target_wid]  # Occurrence count of the target word
    neighbours = cooc_counts[target_wid]   # wid -> co-occurrence count with the target
    pmis = []
    for word in words:
        wid = wid_of[word]
        # Looked up before the co-occurrence check so that a sentiment word
        # missing from the counts is reported (KeyError) rather than ignored.
        word_count = occ_counts[wid]
        if wid in neighbours:  # Check if the words actually co-occur
            pmis.append(PMI(neighbours[wid], target_count, word_count, n_obs))
    return pmis


def _mean_or_nan(values):
    """Return the mean of *values*, or NaN (without a warning) if it is empty."""
    return mean(values) if values else float("nan")


# Do a simple error check using a value computed by hand
# (these numbers are from our y,z example). A small tolerance is used
# because the result is a float.
if abs(PMI(2, 4, 3, 12) - 1) > 1e-9:
    print("Warning: PMI is incorrectly defined")
else:
    print("PMI check passed")

# List of positive words:
pos_words = ["love", "like", "great"]
# List of negative words (NOTE(review): "terribl" looks like a stemmed form;
# confirm against the vocabulary):
neg_words = ["hate", "bad", "terribl"]
# List of target words:
targets = ["@justinbieber", "food", "politician", "husband", "wife",
           "self", "kid", "child", "son", "daughter"]

# Collect all words of interest and store their term ids.
# NOTE(review): word2wid is not defined in this file; it must already be in
# scope when this script runs (presumably a word -> term id mapping; confirm).
all_words = set(pos_words + neg_words + targets)
all_wids = {word2wid[x] for x in all_words}

# Define the data structures used to store the counts:
o_counts = {}   # Occurrence counts: wid -> count
co_counts = {}  # Co-occurrence counts: wid -> {other wid -> count}

# Load the data. The file is streamed line by line and closed on exit.
with open("/afs/inf.ed.ac.uk/group/teaching/anlp/lab8/counts") as fp:
    N = float(fp.readline())  # First line contains the number of observations.
    for line in fp:
        record = line.strip()
        if not record:
            continue  # Tolerate blank lines (e.g. a trailing newline).
        # Tab-separated fields: wid, occurrence count, then "wid count" pairs.
        fields = record.split("\t")
        wid0 = int(fields[0])
        if wid0 in all_wids:  # Only get/store counts for words we are interested in
            o_counts[wid0] = int(fields[1])  # Store occurrence counts
            # Store co-occurrence counts
            co_counts[wid0] = {
                int(other): int(count)
                for other, count in (pair.split(" ") for pair in fields[2:])
            }

print("positive words are: ", pos_words)
print("negative words are: ", neg_words)

for target in targets:
    targetid = word2wid[target]
    # PMI between the target and each co-occurring positive / negative word.
    posPMIs = _pmis_with_target(targetid, pos_words, word2wid,
                                o_counts, co_counts, N)
    negPMIs = _pmis_with_target(targetid, neg_words, word2wid,
                                o_counts, co_counts, N)
    # An empty list (no co-occurrence at all) is reported as nan.
    print(target, ": ", _mean_or_nan(posPMIs), "(pos), ",
          _mean_or_nan(negPMIs), "(neg)")