# Homework 4 Solutions # Software Design # Allen Downey # Three things to notice: # 1) The function process_file returns a dictionary # that contains the words from the file and their # frequencies. I use this function twice, to analyze # the book, and also to process the word list. # 2) I used the function subtract to # find the words that appear in the book but not in the # dictionary. This is a general-purpose function that # is likely to be reusable. # 3) The decorate-sort-undecorate (DSU) pattern I used to # find the ten most common words. # Here are the results from The Great Gatsby: # There are 50444 words in the book. # There are 5824 different words in the book. # The ten most common words are: # the 2405 # and 1573 # a 1409 # i 1392 # to 1153 # of 1119 # he 854 # in 806 # was 769 # The words in the book that aren't in the word list are: # 7.15 monte disarray lapped magnanimous stunned settee [and many more...] import sys, string count = 0 # global variable that counts the number of words # trans is used to replace hyphens and apostrophes with spaces trans = string.maketrans("-'", " ") def process_line(line, d): """add the words in line to dictionary d, and increment count""" global count # replace hyphens and apostrophes with spaces line = line.translate(trans) # split the line using whitespace t = line.split() for word in t: # remove punctuation and convert to lowercase word = word.strip(string.punctuation + string.whitespace) word = word.lower() # update count and the word frequency d[word] = d.get(word, 0) + 1 count += 1 def process_file(filename): """make and return a dictionary that contains the words from the given file and the number of times they appear""" d = {} fp = open(filename, 'r') for line in fp: process_line(line, d) return d def sorted_list(d): """make a list of the key-value pairs from d and sort them in descending order by value""" t = [(value, key) for (key, value) in d.items()] t.sort() t.reverse() return t def subtract(d1, d2): """return a dictionary with all keys that appear in d1 but not d2""" res = {} for key in d1.keys(): if not d2.has_key(key): res[key] = 1 return res def main(name, filename, words='words'): book = process_file(filename) print 'There are %d words in the book.' % count print 'There are %d different words in the book.' % len(book) t = sorted_list(book) print 'The ten most common words are:' for (freq, word) in t[0:9]: print '%s\t%d' % (word, freq) words = process_file(words) diff = subtract(book, words) print "The words in the book that aren't in the word list are:" for word in diff.keys(): print word, if __name__ == '__main__': main(*sys.argv)