#!/usr/bin/python # This is a python2 program """ This program takes a list of relative probabilities or frequencies on the command-line and uses these to construct an optimal binary symbol code with the Huffman algorithm. These commands produce the same code: python huffman.py 0.1 0.2 0.7 python huffman.py 100 200 700 This program is written for teaching purposes, and outputs the codes as human-readable strings rather than packing bits into binary files. Provided to help explore the results of the Huffman algorithm, not a demonstration of how to write code! It is currently left as an exercise for the reader to actually encode and decode files with the generated symbol code. You could also look at David MacKay's demonstrations of several compression algorithms: http://www.inference.phy.cam.ac.uk/mackay/python/compress/ Iain Murray, October 2011. """ INF = 1e999 def min_argmin(array): """Returns the minimum element of an array, and its index.""" mn = min(array) return (mn, array.index(mn)) def huffman(probs): """Return Huffman codewords for the given probability distribution.""" nodes = [[x] for x in range(len(probs))] merged_probs = probs[:] while len(nodes) > 1: # find two least probable nodes: (mn, idx) = min_argmin(merged_probs) merged_probs[idx] = INF (mn2, idx2) = min_argmin(merged_probs) # merge them: merged_probs[idx] = mn + mn2; del merged_probs[idx2] nodes[idx] = [nodes[idx], nodes[idx2]] del nodes[idx2] # Recursive navigation of tree of nested lists to construct codes def huffman_helper(cur_code, nodes, codes): if len(nodes) == 1: symbol = nodes[0] codes[symbol] = cur_code else: huffman_helper(cur_code + '0', nodes[0], codes) huffman_helper(cur_code + '1', nodes[1], codes) codes = ['' for x in range(len(probs))] huffman_helper('', nodes[0], codes) return codes def symbol_code_expected_length(probs, codes): return sum(x*len(y) for (x,y) in zip(probs, codes)) def Hbits(probs): """Entropy of discrete distribution, measured in bits.""" from math import log return sum(-x*log(x, 2) for x in probs if x !=0) def main(): import sys if len(sys.argv) < 2: print __doc__ sys.exit(1) probs = map(float, sys.argv[1:]) Z = sum(probs) probs = [x/Z for x in probs] codes = huffman(probs) Lbar = symbol_code_expected_length(probs, codes) print 'Codewords:' print '----------' for cc in codes: print cc print '----------' print 'Expected length: %g bits/symbol' % Lbar print 'Entropy of dist: %g bits/symbol' % Hbits(probs) if __name__ == "__main__": main()