Source code for capidup.finddups

# CapiDup - quickly find duplicate files in directories
# Copyright (C) 2010,2014,2016 Israel G. Lugo
#
# This file is part of CapiDup.
#
# CapiDup is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# CapiDup is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with CapiDup. If not, see <http://www.gnu.org/licenses/>.
#
# For suggestions, feedback or bug reports: israel.lugo@lugosys.com


"""This module implements the CapiDup public API.

Public functions:

    find_duplicates -- find duplicates in a list of files
    find_duplicates_in_dirs -- find duplicates in a list of directories

Public data attributes:

    MD5_CHUNK_SIZE -- block size for reading when calculating MD5
    PARTIAL_MD5_MAX_READ -- max size of partial read
    PARTIAL_MD5_READ_MULT -- partial read size must be a multiple of this
    PARTIAL_MD5_READ_RATIO -- how much (1/n) of a file to read in partial read
    PARTIAL_MD5_THRESHOLD -- file size above which a partial read is done
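
Typical usage sketch (the directory path below is illustrative)::

    from capidup.finddups import find_duplicates_in_dirs

    duplicate_groups, errors = find_duplicates_in_dirs(['/media/photos'])
    for group in duplicate_groups:
        print(group)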

"""

import sys
import os
import stat
import hashlib

from capidup import py3compat


__all__ = [ "find_duplicates", "find_duplicates_in_dirs", "MD5_CHUNK_SIZE",
        "PARTIAL_MD5_READ_MULT", "PARTIAL_MD5_THRESHOLD",
        "PARTIAL_MD5_MAX_READ", "PARTIAL_MD5_READ_RATIO" ]


MD5_CHUNK_SIZE = 512 * 1024
"""Chunk size in bytes, when reading from file to calculate MD5."""

PARTIAL_MD5_READ_MULT = 4 * 1024
"""Divisor of the partial read size, in bytes.

When hashing a portion of a file for comparison, the size of that portion
will be a multiple of this value.

.. tip:: A good choice on GNU/Linux would be multiples of page size
         (usually 4096 bytes on x86).
"""

PARTIAL_MD5_THRESHOLD = 2 * PARTIAL_MD5_READ_MULT
"""Above this file size in bytes, we do a partial comparison first."""

PARTIAL_MD5_MAX_READ = 16 * PARTIAL_MD5_READ_MULT
"""Maximum size of the partial read, in bytes."""

PARTIAL_MD5_READ_RATIO = 4
"""Partial reads of 1/n of the file size (below `PARTIAL_MD5_MAX_READ`)."""



def round_up_to_mult(n, mult):
    """Round an integer up to the next multiple."""

    return ((n + mult - 1) // mult) * mult



def index_files_by_size(root, files_by_size):
    """Recursively index files under a root directory.

    Each regular file is added *in-place* to the files_by_size dictionary,
    according to the file size. This is a (possibly empty) dictionary of
    lists of filenames, indexed by file size.

    Returns a list of error messages for any I/O errors that occurred while
    listing directories. If the list is empty, there were no errors.
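
    Example sketch ('/some/dir' is a placeholder path)::

        files_by_size = {}
        errors = index_files_by_size('/some/dir', files_by_size)
        # files_by_size now maps each file size to a list of the regular
        # files under /some/dir that have that size, e.g.
        # {4096: ['/some/dir/a', '/some/dir/sub/b'], ...}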

    """
    # errors is a list so that the nested helper below can record messages by
    # appending to it (a nested function may mutate, but not rebind, a name
    # from its enclosing scope)
    errors = []

    def _print_error(error):
        """Print a listing error to stderr.

        error should be an os.OSError instance.

        """
        # modify the outside errors value; must be encapsulated in a list,
        # because if we assign to a variable here we just create an
        # independent local copy
        msg = "error listing '%s': %s" % (error.filename, error.strerror)
        sys.stderr.write("%s\n" % msg)
        errors.append(msg)



    for curr_dir, _, filenames in os.walk(root, onerror=_print_error):

        for base_filename in filenames:
            full_path = os.path.join(curr_dir, base_filename)

            file_info = os.lstat(full_path)

            # only want regular files, not symlinks
            if stat.S_ISREG(file_info.st_mode):
                size = file_info.st_size

                if size in files_by_size:
                    # append to the list of files with the same size
                    files_by_size[size].append(full_path)
                else:
                    # start a new list for this file size
                    files_by_size[size] = [full_path]

    return errors



def calculate_md5(filename, length):
    """Calculate the MD5 hash of a file, up to length bytes.

    Returns the MD5 digest in its binary form, as a 16-byte string. Raises
    IOError or OSError in case of error.
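
    Example sketch ('some_file' is a placeholder name)::

        digest = calculate_md5('some_file', 4096)  # MD5 of the first 4096 bytes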

    """
    assert length >= 0

    # shortcut: MD5 of an empty string is 'd41d8cd98f00b204e9800998ecf8427e',
    # represented here in binary, as a bytes literal to match the type
    # returned by hashlib's digest() on Python 3
    if length == 0:
        return b'\xd4\x1d\x8c\xd9\x8f\x00\xb2\x04\xe9\x80\x09\x98\xec\xf8\x42\x7e'

    md5_summer = hashlib.md5()

    f = open(filename, 'rb')

    try:
        bytes_read = 0

        while bytes_read < length:
            chunk_size = min(MD5_CHUNK_SIZE, length - bytes_read)

            chunk = f.read(chunk_size)

            if not chunk:
                # found EOF: means length was larger than the file size, or
                # file was truncated while reading -- print warning?
                break

            md5_summer.update(chunk)

            bytes_read += len(chunk)

    finally:
        f.close()

    md5 = md5_summer.digest()

    return md5



def find_duplicates(filenames, max_size):
[docs] """Find duplicates in a list of files, comparing up to `max_size` bytes. Returns a 2-tuple of two values: ``(duplicate_groups, errors)``. `duplicate_groups` is a (possibly empty) list of lists: the names of files that have at least two copies, grouped together. `errors` is a list of error messages that occurred. If empty, there were no errors. For example, assuming ``a1`` and ``a2`` are identical, ``c1`` and ``c2`` are identical, and ``b`` is different from all others:: >>> dups, errs = find_duplicates(['a1', 'a2', 'b', 'c1', 'c2'], 1024) >>> dups [['a1', 'a2'], ['c1', 'c2']] >>> errors [] Note that ``b`` is not included in the results, as it has no duplicates. """ errors = [] # shortcut: can't have duplicates if there aren't at least 2 files if len(filenames) < 2: return [], errors # shortcut: if comparing 0 bytes, they're all the same if max_size == 0: return [filenames], errors files_by_md5 = {} for filename in filenames: try: md5 = calculate_md5(filename, max_size) except EnvironmentError as e: msg = "unable to calculate MD5 for '%s': %s" % (filename, e.strerror) sys.stderr.write("%s\n" % msg) errors.append(msg) continue if md5 not in files_by_md5: # unique beginning so far; index it on its own files_by_md5[md5] = [filename] else: # found a potential duplicate (same beginning) files_by_md5[md5].append(filename) # Filter out the unique files (lists of files with the same md5 that # only contain 1 file), and create a list of the lists of duplicates. # Don't use values() because on Python 2 this creates a list of all # values (file lists), and that may be very large. duplicates = [l for l in py3compat.itervalues(files_by_md5) if len(l) >= 2] return duplicates, errors def find_duplicates_in_dirs(directories):
[docs] """Recursively scan a list of directories, looking for duplicate files. Returns a 2-tuple of two values: ``(duplicate_groups, errors)``. `duplicate_groups` is a (possibly empty) list of lists: the names of files that have at least two copies, grouped together. `errors` is a list of error messages that occurred. If empty, there were no errors. For example, assuming ``./a1`` and ``dir1/a2`` are identical, ``dir1/c1`` and ``dir2/c2`` are identical, and ``dir2/b`` is different from all others: >>> dups, errs = find_duplicates(['.', 'dir1', 'dir2']) >>> dups [['./a1', 'dir1/a2'], ['dir1/c1', 'dir2/c2']] >>> errors [] """ errors_in_total = [] files_by_size = {} # First, group all files by size for directory in directories: sub_errors = index_files_by_size(directory, files_by_size) errors_in_total += sub_errors all_duplicates = [] # Now, within each file size, check for duplicates. # # We use an iterator over the dict (which gives us the keys), instead # of explicitly accessing dict.keys(). On Python 2, dict.keys() returns # a list copy of the keys, which may be very large. for size in iter(files_by_size): # for large file sizes, divide them further into groups by matching # initial portion; how much of the file is used to match depends on # the file size if size >= PARTIAL_MD5_THRESHOLD: partial_size = min(round_up_to_mult(size // PARTIAL_MD5_READ_RATIO, PARTIAL_MD5_READ_MULT), PARTIAL_MD5_MAX_READ) possible_duplicates_list, sub_errors = find_duplicates(files_by_size[size], partial_size) errors_in_total += sub_errors else: # small file size, group them all together and do full MD5s possible_duplicates_list = [files_by_size[size]] # Do full MD5 scan on suspected duplicates. calculate_md5 (and # therefore find_duplicates) needs to know how many bytes to scan. # We're using the file's size, as per stat(); this is a problem if # the file is growing. We'll only scan up to the size the file had # when we indexed. Would be better to somehow tell calculate_md5 to # scan until EOF (e.g. give it a negative size). for possible_duplicates in possible_duplicates_list: duplicates, sub_errors = find_duplicates(possible_duplicates, size) all_duplicates += duplicates errors_in_total += sub_errors return all_duplicates, errors_in_total # vim: set expandtab smarttab shiftwidth=4 softtabstop=4 tw=75 :