#!/usr/bin/python -E

# (C) 2007,2008 Jack Lloyd
# GPL v2

"""
Find duplicate files by successive hash filtering.

At each step, we have dict{hash} -> list of files

If two files are in the same list, they are potentially the same
contents b/c they share the same hash.

Initial step, dict{size_of_file} -> list of all files of size N

To filter again, all files with the same hash (each list in
dict.values()) are rehashed with a new hash function f() and
reindexed.

If there is only one file with a particular hash, it can be dropped
from future consideration: assumption is
  file1 == file2 -> h(file1) == h(file2)
and so if only a single file has a particular hash, it cannot
possibly be a duplicate of any other files.

Runs on Python 2.6+ and Python 3.
"""

import getopt
import hashlib
import os
import sys

# Number of bytes requested from the file per read() call while hashing.
_CHUNK_SIZE = 64 * 1024


def filter_dups(fn, input):
    """Re-bucket every still-ambiguous file by a new key function.

    fn    -- callable taking a file path and returning a hashable key
    input -- dict mapping a previous key to a list of file paths

    Returns a new dict mapping fn(path) to the list of paths that
    produced that key.  Buckets of `input` holding at most one path are
    dropped without calling fn on them: a file alone in its bucket
    cannot be a duplicate of anything.
    """
    output = {}
    for maybe_dups in input.values():
        # if only one value in this hash bucket, not a dup
        if len(maybe_dups) <= 1:
            continue
        for path in maybe_dups:
            output.setdefault(fn(path), []).append(path)
    return output


def hash_n_kb(n):
    """Build a key function that hashes the first `n` KiB of a file.

    n -- number of KiB (1024-byte units) to hash, or None to hash the
         whole file

    The returned function takes a filename and returns the string
    "md5(<bytes hashed>)=<hex digest>".  MD5 is used here only as a
    content fingerprint, not for security.
    """
    limit = None if n is None else n * 1024

    def hash_n(filename):
        digest = hashlib.md5()
        consumed = 0
        # Binary mode: we compare raw bytes, and the context manager
        # guarantees the handle is closed even if read() fails.
        with open(filename, 'rb') as stream:
            while limit is None or consumed < limit:
                want = _CHUNK_SIZE
                if limit is not None:
                    # Never read past the requested prefix.
                    want = min(want, limit - consumed)
                data = stream.read(want)
                if not data:
                    break  # EOF
                # Hash the chunk *before* re-testing the limit so the
                # final chunk of the prefix is part of the digest.
                digest.update(data)
                consumed += len(data)
        return "md5(%d)=%s" % (consumed, digest.hexdigest())

    return hash_n


def main(argv=None):
    """Command-line entry point: print groups of duplicate files.

    argv -- argument vector including the program name; defaults to
            sys.argv

    Options:
      -h, --help         print usage and return
      -f KB1[,KB2,...]   sizes (in KiB) of the prefix-hash passes run
                         before the final full-file hash (default: 4)

    Every remaining argument is a directory that is walked recursively;
    symbolic links are ignored.  One line is printed per group of
    identical files: the full-file hash key followed by the sorted
    paths, separated by spaces.

    Returns 0 on success (and after printing usage), 2 on a
    command-line error.
    """
    if argv is None:
        argv = sys.argv

    try:
        opts, args = getopt.getopt(argv[1:], "hf:", ["help"])
    except getopt.GetoptError as err:
        print(str(err))
        return 2

    # Default is filter by size, then hash of first 4 Kb, then full file
    filter_kb = [4]

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print("Usage: %s [-f KB1[,KB2[,...]]] DIR [DIR ...]" % (argv[0]))
            return 0
        elif opt == "-f":
            try:
                filter_kb = [int(field) for field in arg.split(',')]
            except ValueError:
                print("option -f requires comma-separated integers, got %r"
                      % (arg,))
                return 2

    # First pass: bucket by file size, which needs no file reads.
    dups = {}
    seen = set()  # a path reached via two arguments must be listed once
    for arg in args:
        for dirpath, _dirnames, filenames in os.walk(os.path.realpath(arg)):
            for name in filenames:
                path = os.path.join(dirpath, name)
                if path in seen or os.path.islink(path):
                    continue
                seen.add(path)
                dups.setdefault(os.stat(path).st_size, []).append(path)

    # Cheap prefix hashes first, to avoid reading whole files needlessly.
    for kb in filter_kb:
        dups = filter_dups(hash_n_kb(kb), dups)

    # Final pass: hash the entire file to confirm dups (if needed)
    dups = filter_dups(hash_n_kb(None), dups)

    for key, paths in dups.items():
        if len(paths) <= 1:
            continue
        paths.sort()
        print("%s %s" % (key, " ".join(paths)))

    return 0


if __name__ == '__main__':
    sys.exit(main())