Hello
this is a simple script to find the duplicated files by md5sum
so if u have 2 files with the same content but with different names, u still can catch them
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
#duplicate file finder by file md5sum #author N1X import sys import os import subprocess from os.path import join, abspath from os import walk from time import sleep try: sys.argv[1] except IndexError: print "usage: python filedub.py /full/path/to/dir/" sys.exit() fileList = {} targetdir = sys.argv[1] totalfiles = 0 devnull = open('/dev/null', 'w') print "#" * 20 print '#' * 5 , 'Scan Start on :' , targetdir print '#' * 20 print 'Total Founded:' for root, dirs, files in os.walk(targetdir, topdown=True, onerror=None, followlinks=False): totalfiles += len(files) for file in files: file = abspath(join(root, file)) cmd = 'md5sum "%s"' % file sum = subprocess.Popen(cmd , stderr=subprocess.PIPE, shell=True, stdout=subprocess.PIPE) sum = sum.communicate()[0] print sum sum = sum.split() try: sum[0] except IndexError: continue sum = sum[0] if fileList.has_key(sum): fileList[sum].append(file) else: fileList[sum] = [file] print '\r%s'%totalfiles, sys.stdout.flush() sleep(0.5) for key in fileList.keys(): if len(fileList[key]) > 1 : print "\n" print "Total Duplicate for checksum[%s] is : %s)" %(key,len(fileList[key])) i=0 for dub in fileList[key]: i = i + 1 print i,":" ,dub,"if you want to delete this file pres y" action = raw_input('--> ') if action == "y": os.remove(dub) |