# docdist7.py # Authors: Ronald L. Rivest, Erik Demaine # Date Last Modified: February 20, 2008 # Changelog: # Version 1: # Initial version # Version 2: # Added profiling to get routine timings # Version 3: # Changed concatenate to extend in get_words_from_line_list # Version 4: # Changed count_frequency to use dictionaries instead of lists # Version 5: # Change get_words_from_string to use string translate and split # Version 6: # Changed sorting from insertion sort to merge sort # Version 7: # Remove sorting altogether via more hashing # # Usage: # docdist7.py filename1 filename2 # # This program computes the "distance" between two text files # as the angle between their word frequency vectors (in radians). # # For each input file, a word-frequency vector is computed as follows: # (1) the specified file is read in # (2) it is converted into a list of alphanumeric "words" # Here a "word" is a sequence of consecutive alphanumeric # characters. Non-alphanumeric characters are treated as blanks. # Case is not significant. # (3) for each word, its frequency of occurrence is determined # (4) the word/frequency lists are sorted into order alphabetically # # The "distance" between two vectors is the angle between them. # If x = (x1, x2, ..., xn) is the first vector (xi = freq of word i) # and y = (y1, y2, ..., yn) is the second vector, # then the angle between them is defined as: # d(x,y) = arccos(inner_product(x,y) / (norm(x)*norm(y))) # where: # inner_product(x,y) = x1*y1 + x2*y2 + ... xn*yn # norm(x) = sqrt(inner_product(x,x)) import math # math.acos(x) is the arccosine of x. # math.sqrt(x) is the square root of x. import string # string.join(words,sep) takes a given list of words, # and returns a single string resulting from concatenating them # together, separated by the string sep . # string.lower(word) converts word to lower-case import sys ################################## # Operation 1: read a text file ## ################################## def read_file(filename): """ Read the text file with the given filename; return a list of the lines of text in the file. """ try: fp = open(filename) L = fp.readlines() except IOError: print "Error opening or reading input file: ",filename sys.exit() return L ################################################# # Operation 2: split the text lines into words ## ################################################# def get_words_from_line_list(L): """ Parse the given list L of text lines into words. Return list of all words found. """ word_list = [] for line in L: words_in_line = get_words_from_string(line) # Using "extend" is much more efficient than concatenation here: word_list.extend(words_in_line) return word_list # global variables needed for fast parsing # translation table maps upper case to lower case and punctuation to spaces translation_table = string.maketrans(string.punctuation+string.uppercase, " "*len(string.punctuation)+string.lowercase) def get_words_from_string(line): """ Return a list of the words in the given input string, converting each word to lower-case. Input: line (a string) Output: a list of strings (each string is a sequence of alphanumeric characters) """ line = line.translate(translation_table) word_list = line.split() return word_list ############################################## # Operation 3: count frequency of each word ## ############################################## def count_frequency(word_list): """ Return a dictionary mapping words to frequency. """ D = {} for new_word in word_list: if D.has_key(new_word): D[new_word] = D[new_word]+1 else: D[new_word] = 1 return D ############################################# ## compute word frequencies for input file ## ############################################# def word_frequencies_for_file(filename): """ Return dictionary of (word,frequency) pairs for the given file. """ line_list = read_file(filename) word_list = get_words_from_line_list(line_list) freq_mapping = count_frequency(word_list) print "File",filename,":", print len(line_list),"lines,", print len(word_list),"words,", print len(freq_mapping),"distinct words" return freq_mapping def inner_product(D1,D2): """ Inner product between two vectors, where vectors are represented as dictionaries of (word,freq) pairs. Example: inner_product({"and":3,"of":2,"the":5}, {"and":4,"in":1,"of":1,"this":2}) = 14.0 """ sum = 0.0 for key in D1: if key in D2: sum += D1[key] * D2[key] return sum def vector_angle(D1,D2): """ The input is a list of (word,freq) pairs, sorted alphabetically. Return the angle between these two vectors. """ numerator = inner_product(D1,D2) denominator = math.sqrt(inner_product(D1,D1)*inner_product(D2,D2)) return math.acos(numerator/denominator) def main(): if len(sys.argv) != 3: print "Usage: docdist7.py filename_1 filename_2" else: filename_1 = sys.argv[1] filename_2 = sys.argv[2] sorted_word_list_1 = word_frequencies_for_file(filename_1) sorted_word_list_2 = word_frequencies_for_file(filename_2) distance = vector_angle(sorted_word_list_1,sorted_word_list_2) print "The distance between the documents is: %0.6f (radians)"%distance if __name__ == "__main__": import profile profile.run("main()")