import nltk

#import brown corpus
from nltk.corpus import brown

# module for training a Hidden Markov Model and tagging sequences
from nltk.tag.hmm import HiddenMarkovModelTagger

# module for computing a Conditional Frequency Distribution
from nltk.probability import ConditionalFreqDist

# module for computing a Conditional Probability Distribution
from nltk.probability import ConditionalProbDist

# module for computing a probability distribution with the Maximum Likelihood Estimate
from nltk.probability import MLEProbDist

import operator
import random

############# INTRO POS #################

def intro():
  """Print a short tour of the POS-tagged data used in this lab.

  Shows the Penn Treebank tagset help from NLTK, then prints one sentence of
  the Brown 'news' category twice: once as returned without a tagset argument
  and once with tagset='universal'.
  """
  # NLTK ships POS-tagged corpora together with helpers to inspect them;
  # this call prints the description of the Penn Treebank tagset.
  nltk.help.upenn_tagset()

  # Position of the example sentence shown for both tagsets.
  sample_index = 42

  # tagged_sents() yields sentences as lists of (word, tag) pairs.
  # NOTE(review): the banner below says "Penn Treebank", but no tagset is
  # requested on this call, so the corpus' default tags are printed --
  # confirm that these really are Penn Treebank labels.
  default_tagged = brown.tagged_sents(categories='news')
  print("Sentence tagged with Penn Treebank POS labels:")
  print(default_tagged[sample_index])

  # Passing tagset='universal' switches the output to the Universal tags.
  universal_tagged = brown.tagged_sents(categories='news', tagset='universal')
  print("Sentence tagged with Universal POS:")
  print(universal_tagged[sample_index])

# The introduction runs every time this file is executed or imported.
# Comment out the call below to hide the intro output.
intro()


############# EXERCISE 1 #################
# Solution for exercise 1
# Input: genre (string), tagset (string)
# Output: number_of_tags (int), top_tags (list of string)


# get the number of tags found in the corpus
# compute the Frequency Distribution of tags

def ex1(genre,tagset):
  """Count the distinct POS tags of a Brown category and list the top ten.

  Input:
    genre   -- Brown corpus category name (string), e.g. 'news'
    tagset  -- tagset name passed through to the corpus reader (string),
               or None to use the corpus' default tags
  Output:
    (number_of_tags, top_tags) where number_of_tags is the number of
    distinct tags observed (int) and top_tags is the list of the (up to)
    10 most frequent tags (list of string), most frequent first.
  """
  # get the tagged words from the corpus: a sequence of (word, tag) pairs
  tagged_words = brown.tagged_words(categories=genre, tagset=tagset)

  # keep only the tag of every token
  tags = [tag for (_word, tag) in tagged_words]

  # Frequency Distribution of the tags in the corpus
  tagsFDist = nltk.FreqDist(tags)

  # every distinct tag is one key of the distribution
  number_of_tags = len(tagsFDist)

  # most_common() yields (tag, count) pairs; the documented output is the
  # list of tag strings, so the counts are dropped here
  top_tags = [tag for (tag, _count) in tagsFDist.most_common(10)]

  return (number_of_tags,top_tags)


def test_ex1():
  """Print the ex1() result for two genres under two tagsets."""
  # (banner, genre, tagset) for each run, in the order they are reported.
  # The first two runs use the default tagset, the last two repeat the same
  # genres with the Universal tagset.
  runs = (
    ("Tag FreqDist for news:", 'news', None),
    ("Tag FreqDist for science_fiction:", 'science_fiction', None),
    ("Tag FreqDist for news with Universal tagset:", 'news', 'universal'),
    ("Tag FreqDist for science_fiction with Universal tagset:",
     'science_fiction', 'universal'),
  )

  for banner, genre, tagset in runs:
    print(banner)
    print(ex1(genre, tagset))

### Uncomment to test exercise 1
# Let's look at the top tags for different genre and tagsets
#  and observe the differences
#test_ex1()

############# EXERCISE 2 #################
# Solution for exercise 2
# Input: sentence (list of string), size (<4600)
# Output: tagger (HiddenMarkovModelTagger), hmm_tagged_sentence (list of tuples), eres (evaluation score on the test data)

# hint: use the help on HiddenMarkovModelTagger to find out how to train, tag and evaluate the HMM tagger
def ex2(sentence, size):
  """Train an HMM tagger on Brown 'news', tag a sentence and score the tagger.

  Input:
    sentence -- the words to tag (list of string)
    size     -- number of sentences, taken from the end of the 'news'
                category, to train on (positive int)
  Output:
    (tagger, hmm_tagged_sentence, eres) where tagger is the trained
    HiddenMarkovModelTagger, hmm_tagged_sentence is the tagged input
    (list of (word, tag) tuples) and eres is the score returned by the
    tagger's evaluate() on the first 100 sentences of the category.
  Raises:
    ValueError -- if size is not positive.
  """
  # A slice starting at -0 selects the WHOLE corpus and a negative size
  # selects an unrelated tail, so reject both instead of training silently
  # on the wrong data.
  if size <= 0:
    raise ValueError("size must be a positive number of sentences")

  tagged_sentences = brown.tagged_sents(categories= 'news')

  # set up the training data: the last `size` sentences
  train_data = tagged_sentences[-size:]

  # set up the test data: the first 100 sentences
  # NOTE(review): for a large enough `size` the two slices overlap and the
  # score is measured partly on training data -- confirm the intended limit.
  test_data = tagged_sentences[:100]

  # train a supervised HiddenMarkovModelTagger on the labelled sentences
  tagger = HiddenMarkovModelTagger.train(train_data)

  # tag the given sentence with the trained tagger
  hmm_tagged_sentence = tagger.tag(sentence)

  # score the tagger against the gold-standard test sentences
  eres = tagger.evaluate(test_data)

  return (tagger, hmm_tagged_sentence,eres)


def test_ex2():
  """Run ex2() with two training-set sizes and print the outcome of each."""
  # Use the words (tags stripped) of one 'news' sentence as the input.
  news_sentences = brown.tagged_sents(categories= 'news')
  words = [pair[0] for pair in news_sentences[42]]

  # Same report for a small and a large training set, so the scores can be
  # compared.
  for training_size in (500, 3000):
    (_tagger, hmm_tagged_sentence, eres) = ex2(words, training_size)
    print("Sentenced tagged with nltk.HiddenMarkovModelTagger:")
    print(hmm_tagged_sentence)
    print("Eval score:")
    print(eres)

### Uncomment to test exercise 2
#Look at the tagged sentence and the accuracy of the tagger. How does the size of the training set affect the accuracy?
#test_ex2()


############# EXERCISE 3 #################
# Solution for exercise 3
# Input: tagged_words (list of tuples)
# Output: emission_FD (ConditionalFreqDist), top_NN (top 10 words for the tag NN), emission_PD (ConditionalProbDist), p_NN (float), p_DT (float)


# in the previous labs we've seen how to build a freq dist
# we need conditional distributions to estimate the transition and emission models
# in this exercise we estimate the emission model
def ex3(tagged_words):
  """Estimate the emission model P(word | tag) from tagged words.

  Input:
    tagged_words -- (word, tag) pairs (list of tuples)
  Output:
    (emission_FD, top_NN, emission_PD, p_NN, p_DT) where
      emission_FD -- ConditionalFreqDist of lowercased words given their tag
      top_NN      -- the (up to) 10 most frequent (word, count) pairs for
                     the tag NN
      emission_PD -- ConditionalProbDist built from emission_FD with the
                     Maximum Likelihood Estimate
      p_NN        -- P(year|NN) (float)
      p_DT        -- P(year|DT) (float)
  """
  # prepare the data: (condition, observation) pairs, i.e. (tag, word),
  # with the words lowercased so that case variants are counted together
  data = [(tag, word.lower()) for (word, tag) in tagged_words]

  # Conditional Frequency Distribution for words given their tags
  emission_FD = ConditionalFreqDist(data)

  # the 10 most frequent words given the tag NN, with their counts
  top_NN = emission_FD['NN'].most_common(10)

  # Conditional Probability Distribution estimated with MLEProbDist
  emission_PD = ConditionalProbDist(emission_FD, MLEProbDist)

  # probabilities P(year|NN) and P(year|DT)
  p_NN = emission_PD['NN'].prob('year')
  p_DT = emission_PD['DT'].prob('year')

  return (emission_FD, top_NN, emission_PD, p_NN, p_DT)


def test_ex3():
  """Run ex3() on the Brown 'news' words and print its headline results."""
  news_words = brown.tagged_words(categories='news')
  (_emission_FD, top_NN, _emission_PD, p_NN, p_DT) = ex3(news_words)

  # Each line is "<label> <value>": the label and the value are joined by a
  # single space, exactly as a two-item print statement would emit them.
  report = (
    ("Frequency of words given the tag *NN*: ", top_NN),
    ("P(year|NN) = ", p_NN),
    ("P(year|DT) = ", p_DT),
  )
  for label, value in report:
    print("%s %s" % (label, value))

### Uncomment to test exercise 3
#Look at the estimated probabilities. Why is P(year|DT) = 0 ? What are the problems with having 0 probabilities and what can be done to avoid this?
#test_ex3()

############# EXERCISE 4 #################
# Solution for exercise 4
# Input: tagged_sentences (list)
# Output: transition_FD (ConditionalFreqDist), transition_PD (ConditionalProbDist), p_VBD_NN (float), p_DT_NN (float)

# compute the transition probabilities
# the probabilities of a tag at position i+1 given the tag at position i
def ex4(tagged_sentences):
  """Estimate the transition model P(tag_(i+1) | tag_(i)) from tagged sentences.

  Input:
    tagged_sentences -- sentences, each a list of (word, tag) tuples
  Output:
    (transition_FD, transition_PD, p_VBD_NN, p_DT_NN) where
      transition_FD -- ConditionalFreqDist of a tag given the previous tag
      transition_PD -- ConditionalProbDist built from transition_FD with the
                       Maximum Likelihood Estimate
      p_VBD_NN      -- P(NN|VBD) (float)
      p_DT_NN       -- P(NN|DT) (float)
  """
  # prepare the data: (condition, observation) pairs, i.e.
  # (tag_(i), tag_(i+1)). The pairs are built sentence by sentence so that
  # the last tag of one sentence is never paired with the first tag of the
  # next one.
  data = []
  for sentence in tagged_sentences:
    tags = [tag for (_word, tag) in sentence]
    # zip() stops at the shorter input, so sentences with fewer than two
    # tokens simply contribute no pairs
    data.extend(zip(tags, tags[1:]))

  # Conditional Frequency Distribution for a tag given the previous tag
  transition_FD = ConditionalFreqDist(data)

  # Conditional Probability Distribution for P(tag_(i+1)|tag_(i)),
  # estimated with MLEProbDist
  transition_PD = ConditionalProbDist(transition_FD, MLEProbDist)

  # probabilities P(NN|VBD) and P(NN|DT)
  p_VBD_NN = transition_PD['VBD'].prob('NN')
  p_DT_NN = transition_PD['DT'].prob('NN')

  return (transition_FD, transition_PD,p_VBD_NN, p_DT_NN )


def test_ex4():
  """Run ex4() on the Brown 'news' sentences and print both probabilities."""
  news_sentences = brown.tagged_sents(categories= 'news')
  (_transition_FD, _transition_PD, p_VBD_NN, p_DT_NN) = ex4(news_sentences)

  # Label and value are joined by a single space, exactly as a two-item
  # print statement would emit them.
  print("%s %s" % ("P(NN|VBD) = ", p_VBD_NN))
  print("%s %s" % ("P(NN|DT) = ", p_DT_NN))

### Uncomment to test exercise 4
# Are the results what you would expect? The sequence DT NN seems very probable. How will this affect the sequence tagging?
#test_ex4()