crazy4groovy
3/15/2012 - 1:20 AM

Simple duplicate file finder via MD5 hash

Simple duplicate file finder via MD5 hash

import java.util.regex.Pattern
import java.security.MessageDigest
import groovy.io.FileType
import groovy.transform.*

// Method pointer to Console.readLine, used for every interactive prompt.
// NOTE(review): System.console() returns null when no interactive console is
// attached (e.g. piped stdin or some IDEs); this script assumes a real terminal.
def input = System.console().&readLine

// Comma-separated list of root directories to scan; empty/EOF answer falls back to "./".
String ROOT_DIRS = ( input("Enter the root directory [./] ") ?: "./" )
// Regex applied to file names during the walk; empty/EOF answer matches everything.
def SELECT_FILENAMES_WITH_REGEX = Pattern.compile( input("Enter a file filter [.*] ") ?: ".*" )
// readLine yields null on end-of-input, so default to "" (i.e. "No") like the prompts above
// instead of failing with a NullPointerException.
boolean waitForOK = (input("Approve each file delete? [y/N] ") ?: "").toLowerCase().contains("y")

/**
 * Computes the MD5 digest of a file's contents.
 *
 * @param file object exposing withInputStream (a java.io.File in this script)
 * @return the digest as a 32-character, zero-padded, lowercase hexadecimal string
 */
String generateMD5(final file) {
	MessageDigest digest = MessageDigest.getInstance("MD5")
	// Stream the file in 8 KiB chunks so large files are never held in memory.
	file.withInputStream { is ->
		byte[] buffer = new byte[8192]
		int read = 0
		while( (read = is.read(buffer)) > 0 ) {
			digest.update(buffer, 0, read)
		}
	}
	// Signum 1 keeps the value non-negative. BigInteger.toString(16) drops leading
	// zeros, so pad back to the full 32 hex digits of a 128-bit digest.
	return new BigInteger(1, digest.digest()).toString(16).padLeft(32, '0')
}

// Callback invoked for every file matched during the walk.
// In approval mode (waitForOK) every path is recorded under its hash and nothing is
// deleted here. Otherwise the first file seen for a hash is recorded ("."), and any
// later file with the same hash is deleted on the spot ("x").
def foundClosure = { file ->
	String md5 = generateMD5(file)

	// The lookup goes through the map's default, so an unseen hash yields an empty list.
	def knownPaths = md5FilePaths[md5]
	boolean keepFile = waitForOK || !knownPaths

	if (keepFile) {
		knownPaths << file.absolutePath
		print "."
	}
	else {
		new File(file.absolutePath).delete()
		print "x"
	}

	fileCnt++
}

// Recursively visits every regular file under `filepath` whose name matches `filterOnly`.
//   filepath   - path of the root directory to walk
//   filterOnly - name filter handed to File.traverse (a regex Pattern in this script)
//   onFind     - closure called with each matching File
//   onEnd      - optional closure called once after a successful walk
// Prints an error instead of throwing when the path is missing or is not a directory.
def walkFiles = {filepath, filterOnly, onFind, onEnd = {} ->
	File root = new File(filepath)
	// traverse() throws IllegalArgumentException for an existing non-directory path,
	// which the catch below does not cover, so reject anything that is not a directory first.
	if (!root.isDirectory()) {
		println "ERROR: invalid file/directory"
		return
	}
	try {
		root.traverse([type:FileType.FILES, nameFilter:filterOnly], onFind)
		onEnd()
	}
	catch (FileNotFoundException e) { println "ERROR: invalid file/directory"}
}

////MAIN////

// MD5 hash -> list of absolute paths with that hash; unknown keys start as an empty list.
@Field Map md5FilePaths = [:].withDefault { [] }
// Number of files visited in the root directory currently being processed.
@Field int fileCnt = 0

// Start timestamp in milliseconds, used for the elapsed-time report at the end.
def ts = System.currentTimeMillis()
// Running total of deletions across all root directories.
def totalDelCount = 0

// Processes each comma-separated root directory independently: hash every matching
// file, report totals, then (in approval mode) walk the duplicate groups and delete
// the extras, asking the user per file until they answer "a" (all).
ROOT_DIRS.split(',').each { dir ->
	// Fresh state per root: duplicates are only detected within a single root directory.
	md5FilePaths = [:].withDefault{[]}
	fileCnt = 0

	walkFiles(dir, SELECT_FILENAMES_WITH_REGEX, foundClosure)

	println "\nTotal: ${fileCnt}\nUnique: ${md5FilePaths.keySet().size()}"

	if (!waitForOK) {
		// Without approval mode the walk callback already deleted the duplicates.
		// NOTE(review): this count assumes every one of those deletes succeeded.
		totalDelCount += (fileCnt - md5FilePaths.keySet().size())
		return;
	}

	// Temporary listing of the duplicate paths, written before anything is deleted and
	// removed again once this directory is done. Any stale copy is cleared first.
	File resultsFile = new File('dupFound.bak.txt')
	resultsFile.delete()
	md5FilePaths.values().each {
		if (it.size() > 1)
			resultsFile << it[1..-1].join(';')
		resultsFile << '\n'
	}

	println "***DELETING***"

	int delCount = 0

	md5FilePaths.each { k,v ->
		if (waitForOK && v.size() > 1)
			println "Comparison file: ${v[0]}"
		while (v.size() > 1) {
			// Take the LAST path explicitly. List.pop() removes the last element before
			// Groovy 2.5 but the first element from 2.5 on, which would offer the
			// comparison file itself for deletion.
			String f = v.remove(v.size() - 1)
			boolean approved
			if (waitForOK) {
				// Pass the text as a format ARGUMENT: Console.readLine treats its first
				// parameter as a format string, so a '%' in a file name would throw.
				// A null answer (end of input) is treated as "no".
				String answer = (input("%s", "OK to delete " + f + "? [y/n/a] ") ?: "").trim().toLowerCase()
				// Match on the leading letter so answers like "nah" are not read as "all".
				if (answer.startsWith("a"))
					waitForOK = false
				approved = !waitForOK || answer.startsWith("y")
			}
			else {
				println f
				approved = true
			}
			if (approved) {
				// Only count files that were actually removed.
				if (new File(f).delete())
					delCount++
				else
					println "Could not delete: ${f}"
			}
			else
				println "Skipped: ${f}"
		}
	}
	resultsFile.delete()

	println "***DELETED $delCount FILES***"
	totalDelCount += delCount
}

// Final report: elapsed minutes (rounded to two decimals), deletion total, then a
// prompt that waits for the user before the script exits.
def elapsedSeconds = (System.currentTimeMillis() - ts) / 1000
def elapsedMinutes = Math.round((elapsedSeconds * 100) / 60 ) / 100
println "total time: ${elapsedMinutes} min @ ${new Date().toString()}"
println "total deleted: ${totalDelCount}"
println "*" * 40
System.console().readLine('Press a key to quit: ')