import sys import string from pylab import * class Hist(dict): """a histogram is a dictionary that maps from each key (x) to the number of times the key has appeared (frequency, f) """ def __init__(self, seq=[]): "create a new histogram starting with the items in seq" for x in seq: self.count(x) def count(self, x): "increment the counter associated with key x" self[x] = self.get(x, 0) + 1 def pdf(self): """return a sorted list of keys (xs) and the corresponding list of frequencies (fs). """ # sort the x,f pairs in the dictionary by x t = self.items() t.sort() xs = [x for x,f in t] fs = [f for x,f in t] return xs, fs def print_pdf(self): """print the sorted keys and the corresponding fractions""" xs, fs = self.pdf() for x, f in zip(xs, fs): print x, f def plot_pdf(self, *args, **kwds): """plot the pdf as a bar chart. args and kwds are passed along to the bar function.""" left, height = self.pdf() patches = bar(left, height, *args, **kwds) xlabel('key') ylabel('frequency') title(r'pdf') show() def cdf(self): """return a sorted list of keys (xs) and the corresponding list of cumulative fractions (the fraction of keys less than or equal to x). This is the empirical CDF of the keys in the Hist. Note: the cdf makes more sense if the keys in the Hist are numeric, but this function works for any data type that can be sorted. """ # accumulate the running sum of the frequencies and # the cumulative fraction (cf) of the total total = sum(self.values()) runsum = 0 t = self.items() t.sort() xs = [] cfs = [] def append(x, runsum): """compute the cumulative fraction and append to the lists""" cf = float(runsum) / total xs.append(x) cfs.append(cf) for x, f in t: # a cdf is really a step function, so we have to add two # points to the line for each key (before and after the update) append(x, runsum) runsum += f append(x, runsum) return xs, cfs def print_cdf(self): """print the keys (in increasing order) and the corresponding cumulative fractions""" xs, fs = self.cdf() for x, f in zip(xs, fs): print x, f def plot_cdf(self, plot_func=plot, *args, **kwds): """plot the cdf as a step function; the optional second parameter is one of the line plotting functions in pylab, either plot, semilogx, semilogy or loglog. args and kwds are passed along to the plotting function """ xs, fs = self.cdf() patches = plot_func(xs, fs, *args, **kwds) xlabel('key') ylabel('cumulative fraction') title('cdf') show() def plot_ccdf(self, plot_func=plot, *args, **kwds): xs, fs = self.cdf() fs = [1-f for f in fs] patches = plot_func(xs, fs, *args, **kwds) xlabel('key') ylabel('complementary cumulative fraction') title('ccdf') show() def main(script): def roll_dice(dice, sides=6): """roll (dice) dice with (sides) sides and return the sum """ t = [random.randint(1,sides) for x in range(dice)] return sum(t) h = Hist() for i in range(100): s = roll_dice(3) h.count(s) h.print_pdf() h.plot_pdf() h.print_cdf() h.plot_cdf(linewidth=2) if __name__ == '__main__': main(*sys.argv)