samba
1/11/2017 - 7:00 AM

MD5 Index Scanner

MD5 Index Scanner

#!/usr/bin/env python

"""
Scan an index of files, typically keyed by MD5 checksum, in the format produced by
GNU `md5sum` (not BSD-style), searching for one of many checksums listed in a pattern file.

The primary objective is to help find duplicates in a large collection of files, e.g. an
archive of music. Example:

> find ./music -type f -print0 | xargs -0 md5sum | sort > index.txt
> cut -c 1-33 index.txt  | uniq -c |  awk '{ if($1 > 1){ print $2 }  }' > dupe_checksums.txt
> python scandupes.py dupe_checksums.txt index.txt | xargs -d "\n" -I {} mv -vn "{}" ./dupes/


File Examples:

<index file>
009d2ba87423037d2b428953530720ac  ./file1
019f8ae032f0db625f6de28226ca0af6  ./file2

<pattern file>
009d2ba87423037d2b428953530720ac
109d2ba87423037d2b428953530720ac
209d2ba87423037d2b428953530720ac

"""

import fileinput
import sys
import os

def notify(text, *args, **kwargs):
    """Write a formatted progress message, followed by a newline, to stderr.

    ``text`` is a ``str.format`` template that is filled in with ``args``
    and ``kwargs``.
    """
    # ``print >>sys.stderr, ...`` is a Python 2-only statement; on Python 3
    # it parses as an expression and raises TypeError at runtime. Writing to
    # the stream directly behaves the same on both interpreters.
    sys.stderr.write(text.format(*args, **kwargs) + '\n')

def scan_checksums(filename):
    """Yield ``(checksum, name)`` pairs parsed from a checksum listing.

    Each line is expected to hold a checksum, optionally followed by two
    spaces and a file name (the layout shown in the module docstring).
    Lines that carry only a checksum, such as those of a pattern file,
    yield ``None`` as the name. Blank lines are skipped.

    ``filename`` is passed straight to ``fileinput.FileInput``.
    """
    reader = fileinput.FileInput(filename)
    try:
        for line in reader:
            # Remove only the line terminator and leading indentation;
            # trailing blanks may be part of the file name.
            entry = line.rstrip('\r\n').lstrip()
            # Split on the FIRST double space only, so a file name that
            # itself contains two consecutive spaces is kept intact.
            checksum, _, name = entry.partition('  ')
            checksum = checksum.rstrip()
            if not checksum:
                continue
            yield (checksum, name if name.strip() else None)
    finally:
        # Release the underlying file even if the consumer stops early.
        reader.close()
        
def find_matching(pattern_file, scan_file):
    """Yield the file names of index entries whose checksum is wanted.

    ``pattern_file`` lists the checksums to look for; ``scan_file`` is the
    index that is searched. Both are read with ``scan_checksums``. Progress
    messages are reported through ``notify``.
    """
    notify('Loading pattern file {0}', pattern_file)
    # Only the checksum column of the pattern file matters.
    search = set(entry[0] for entry in scan_checksums(pattern_file))

    notify('Scanning index file {0} with {1:d} patterns',
           scan_file, len(search))

    for checksum, filename in scan_checksums(scan_file):
        # An index line without a file name has nothing to report; passing
        # None on would make main() fail in os.path.isfile(None).
        if filename is not None and checksum in search:
            yield filename


def main(args):
    """Yield each matching file name that refers to an existing file.

    ``args[0]`` is the pattern file and ``args[1]`` is the index file; any
    further items are ignored. Candidates come from ``find_matching`` and
    are kept only when ``os.path.isfile`` confirms them.
    """
    patterns_path, index_path = args[0], args[1]
    for candidate in find_matching(patterns_path, index_path):
        if not os.path.isfile(candidate):
            continue
        yield candidate
    
    
if __name__ == '__main__':
    # Both the pattern file and the index file are required; exit with a
    # usage message instead of an IndexError traceback when one is missing.
    if len(sys.argv) < 3:
        sys.exit('usage: {0} <pattern file> <index file>'.format(sys.argv[0]))
    # sys.stdout.write replaces the Python 2-only ``print line`` statement,
    # which is a SyntaxError on Python 3; the output is one name per line.
    for line in main(sys.argv[1:]):
        sys.stdout.write(line + '\n')