import sys
import string
from pylab import *


class Hist(dict):
    """a histogram is a dictionary that maps from each key (x) to the
    number of times the key has appeared (frequency, f)
    """
    
    def __init__(self, seq=[]):
        "create a new histogram starting with the items in seq"
        for x in seq:
            self.count(x)

    def count(self, x):
        "increment the counter associated with key x"
        self[x] = self.get(x, 0) + 1

    def pdf(self):
        """return a sorted list of keys (xs) and the corresponding
        list of frequencies (fs).
        """
        # sort the x,f pairs in the dictionary by x
        t = self.items()
        t.sort()
        xs = [x for x,f in t]
        fs = [f for x,f in t]
        return xs, fs
        
    def print_pdf(self):
        """print the sorted keys and the corresponding fractions"""
        xs, fs = self.pdf()
        for x, f in zip(xs, fs):
            print x, f

    def plot_pdf(self, *args, **kwds):
        """plot the pdf as a bar chart.  args and kwds are passed
        along to the bar function."""
        left, height = self.pdf()
        patches = bar(left, height, *args, **kwds)
        xlabel('key')
        ylabel('frequency')
        title(r'pdf')
        show()

    def cdf(self):
        """return a sorted list of keys (xs) and the corresponding
        list of cumulative fractions (the fraction of keys less
        than or equal to x).  This is the empirical CDF of the keys
        in the Hist.

        Note: the cdf makes more sense if the keys in the Hist
        are numeric, but this function works for any data type that
        can be sorted.
        """
        # accumulate the running sum of the frequencies and
        # the cumulative fraction (cf) of the total
        total = sum(self.values())
        runsum = 0
        t = self.items()
        t.sort()
        xs = []
        cfs = []

        def append(x, runsum):
            """compute the cumulative fraction and append to the lists"""
            cf = float(runsum) / total
            xs.append(x)
            cfs.append(cf)

        
        for x, f in t:
            # a cdf is really a step function, so we have to add two
            # points to the line for each key (before and after the update)
            append(x, runsum)
            runsum += f
            append(x, runsum)

        return xs, cfs
    
    def print_cdf(self):
        """print the keys (in increasing order) and the corresponding
        cumulative fractions"""
        xs, fs = self.cdf()
        for x, f in zip(xs, fs):
            print x, f

    def plot_cdf(self, plot_func=plot, *args, **kwds):
        """plot the cdf as a step function; the optional second
        parameter is one of the line plotting functions in pylab,
        either plot, semilogx, semilogy or loglog.

        args and kwds are passed along to the plotting function
        """
        xs, fs = self.cdf()
        patches = plot_func(xs, fs, *args, **kwds)
        xlabel('key')
        ylabel('cumulative fraction')
        title('cdf')
        show()

    def plot_ccdf(self, plot_func=plot, *args, **kwds):
        xs, fs = self.cdf()
        fs = [1-f for f in fs]
        patches = plot_func(xs, fs, *args, **kwds)
        xlabel('key')
        ylabel('complementary cumulative fraction')
        title('ccdf')
        show()

def main(script):
    def roll_dice(dice, sides=6):
        """roll (dice) dice with (sides) sides and return the sum
        """
        t = [random.randint(1,sides) for x in range(dice)]
        return sum(t)

    h = Hist()
    for i in range(100):
        s = roll_dice(3)
        h.count(s)

    h.print_pdf()
    h.plot_pdf()
    h.print_cdf()
    h.plot_cdf(linewidth=2)

if __name__ == '__main__':
    main(*sys.argv)
