home/sbin/dedup

#!/usr/bin/env python
from __future__ import unicode_literals
import argparse, functools, hashlib, logging, os, re, sqlite3, stat, sys
sys.path = ["."] + sys.path

class SqliteMultiDict(object):
    """Multimap stored as a two-column ("key", "value") table in SQLite.

    Every insert() adds a row, so one key may map to any number of values.
    All SQL is issued through the connection handed to the constructor.
    """

    def __init__(self, db, table_name):
        """Bind to `table_name` in `db`, creating table and key index if absent."""
        self.db = db
        # Double any embedded quote so the name is safe inside "..." in SQL.
        self.escaped_table_name = table_name.replace('"', '""')
        self.db.executescript(self._sql("""
        CREATE TABLE IF NOT EXISTS "{0}" ("key" BLOB, "value" BLOB);
        CREATE INDEX IF NOT EXISTS "{0}_index" ON "{0}" ("key");
        """))

    def _sql(self, template):
        """Substitute the escaped table name for `{0}` in `template`."""
        return template.format(self.escaped_table_name)

    def _run(self, template, params=()):
        """Execute `template` against this table and return the cursor."""
        return self.db.execute(self._sql(template), params)

    def __contains__(self, key):
        """Return True when at least one row is stored under `key`."""
        rows = self._run("""
        SELECT 1 FROM "{0}" WHERE "key" = ? LIMIT 1
        """, [key]).fetchall()
        return len(rows) > 0

    def __getitem__(self, key):
        """Return an iterator over the values stored under `key`, ascending."""
        # The query runs right here; only the row fetching is lazy.
        cursor = self._run("""
        SELECT "value" FROM "{0}" WHERE "key" = ?
        ORDER BY "value" ASC
        """, [key])
        return (row[0] for row in cursor)

    def clear(self):
        """Delete every row of the table."""
        self._run("""
        DELETE FROM "{0}"
        """)

    def insert(self, key, value):
        """Append one (key, value) row; existing rows are left alone."""
        self._run("""
        INSERT INTO "{0}" ("key", "value") VALUES (?, ?)
        """, [key, value])

    def items(self):
        """Return a cursor over all (key, value) rows ordered by key."""
        return self._run("""
        SELECT "key", "value" FROM "{0}" ORDER BY "key" ASC
        """)

    def nonunique_keys(self):
        """Return an iterator over the keys that occur in more than one row."""
        # The query runs right here; only the row fetching is lazy.
        cursor = self._run("""
        SELECT "key" FROM "{0}" GROUP BY "key" HAVING COUNT(*) > 1
        ORDER BY "key" ASC
        """)
        return (row[0] for row in cursor)

    def nonunique_items(self):
        """Return a cursor over the (key, value) rows whose key is repeated."""
        return self._run("""
        SELECT "key", "value" FROM "{0}" INNER JOIN
        (SELECT "key" AS "key2" FROM "{0}" GROUP BY "key" HAVING COUNT(*) > 1)
        ON "key" = "key2"
        ORDER BY "key" ASC
        """)

# One-character tags prepended to a hex digest to record what kind of entry
# was hashed, so digests of different kinds of entry yield distinct keys.
PREFIX_DIR = "D"    # directory (see hash_dir_entries)
PREFIX_LNK = "L"    # symbolic link, hashed by its target (see hash_file)
PREFIX_REG = "F"    # anything hash_file opens and reads

def shell_escape(s):
    """Quote `s` so that a POSIX shell reads it back as exactly one word.

    A non-empty string made up solely of characters from a conservative safe
    set is returned unchanged.  Anything else, including the empty string,
    is wrapped in single quotes, with every embedded single quote rewritten
    as the sequence '\\''.
    """
    # `\Z` instead of `$`: `$` also matches just before a trailing newline,
    # which would let a name ending in "\n" through unquoted.  `+` instead
    # of `*`: an unquoted empty string is not a word at all to the shell.
    if re.match(r"[A-Za-z0-9,.:/_+-]+\Z", s):
        return s
    return "'{0}'".format(s.replace("'", "'\\''"))

def hexdigest_str(hasher):
    """Return the hex digest of `hasher` as a text (Unicode) string."""
    digest = hasher.hexdigest()
    # Python 2 compatibility: the encode/decode round trip always ends in a
    # Unicode string, whatever string type hexdigest() produced.
    as_bytes = digest.encode("utf-8")
    return as_bytes.decode("utf-8")

def hash_bytes(algorithm, bytestring):
    """Return the hex digest of `bytestring` under the named hashlib algorithm."""
    # hashlib.new accepts the initial data directly.
    return hexdigest_str(hashlib.new(algorithm, bytestring))

def hash_str(algorithm, string):
    """Return the hex digest of `string` after encoding it as UTF-8."""
    encoded = string.encode("utf-8")
    return hash_bytes(algorithm, encoded)

def hash_file(algorithm, filename, blocksize=65536, count=None):
    """Return a type-prefixed hex digest for a file or a symbolic link.

    A symbolic link is hashed by its target string (UTF-8 encoded) and the
    result starts with PREFIX_LNK.  Anything else is opened and hashed by
    content, `blocksize` bytes at a time, and the result starts with
    PREFIX_REG.  When `count` is given, at most `count` blocks are hashed.
    """
    digest = hashlib.new(algorithm)
    if not os.path.islink(filename):
        kind = PREFIX_REG
        with open(filename, "rb") as stream:
            hashed_blocks = 0
            while True:
                chunk = stream.read(blocksize)
                if chunk == b"":
                    break               # end of file
                if count is not None and hashed_blocks >= count:
                    break               # block budget used up
                digest.update(chunk)
                hashed_blocks += 1
    else:
        kind = PREFIX_LNK
        target = os.readlink(filename)
        digest.update(target.encode("utf-8"))
    return kind + hexdigest_str(digest)

def hash_dir_entries(algorithm, hashes):
    """Return a PREFIX_DIR-tagged digest of the children's hashes, fed in order."""
    digest = hashlib.new(algorithm)
    encoded_children = (child.encode("utf-8") for child in hashes)
    for chunk in encoded_children:
        digest.update(chunk)
    return "{0}{1}".format(PREFIX_DIR, hexdigest_str(digest))

def traverse_dir(path, listdir, proc_file, proc_entries):
    '''
    Fold the tree rooted at `path` bottom-up and return the result for `path`.

    listdir: (path) -> [name] or None
        Child names of a directory (each is joined onto `path`), or None
        when `path` is a leaf (file / symbolic link).
    proc_file: (path) -> a
        Called for every leaf.
    proc_entries: (path, [a]) -> a
        Called for every directory with its children's results, in
        `listdir` order.

    An OSError raised while processing one child does not stop its siblings
    from being processed.  If any child failed, `proc_entries` is not called
    for this directory; every failure except the last is logged as a warning
    and the last one is re-raised, so each error is logged or raised exactly
    once.
    '''
    children = listdir(path)
    if children is None:                # file / symbolic link
        return proc_file(path)
    pending = None                      # latest failure, not yet reported
    results = []
    for child in children:
        try:
            results.append(traverse_dir(os.path.join(path, child),
                                        listdir, proc_file, proc_entries))
        except OSError as e:
            if pending is not None:
                # Report the older failure now; the newest one is kept so it
                # can be raised to the caller once all children were tried.
                logging.warning(pending)
            pending = e
    if pending is not None:
        raise pending
    return proc_entries(path, results)

def try_remove(path):
    """Remove the file at `path`, silently ignoring any OSError."""
    try:
        os.remove(path)
    except OSError:
        # Best effort: a missing or undeletable file is not an error here.
        return

def remove_sqlite_db(filename):
    """Delete the SQLite database file, unless it is the in-memory database."""
    if filename == ":memory:":
        return                          # nothing on disk to delete
    try_remove(filename)

def init_cache_db(db):
    """Create the cache tables ("files", "total_sizes", "hashes") if missing."""
    schema = """
    CREATE TABLE IF NOT EXISTS "files"
      ( "id" INTEGER PRIMARY KEY
      , "path" TEXT UNIQUE NOT NULL
      , "entries" TEXT
      , "is_link" INTEGER NOT NULL
      , "size" INTEGER NOT NULL
      );
    CREATE TABLE IF NOT EXISTS "total_sizes"
      ( "file_id" INTEGER PRIMARY KEY
      , "total_size" INTEGER NOT NULL
      );
    CREATE TABLE IF NOT EXISTS "hashes"
      ( "algorithm" TEXT NOT NULL
      , "file_id" INTEGER NOT NULL
      , "hash" TEXT NOT NULL
      , UNIQUE ("algorithm", "file_id")
      );
    """
    db.executescript(schema)

def stat_file_info(path):
    """Describe `path` from the file system, without following symbolic links.

    Returns a dict with keys "path", "entries", "is_link" and "size".
    "entries" is the sorted list of child names for a directory and None for
    a regular file or symbolic link.  Any other kind of file raises OSError.
    """
    info = os.lstat(path)
    mode = info.st_mode
    if stat.S_ISDIR(mode):
        entries = os.listdir(path)
        entries.sort()
    elif not (stat.S_ISLNK(mode) or stat.S_ISREG(mode)):
        raise OSError("not a regular file or symbolic link: {0!r}"
                      .format(path))
    else:
        entries = None
    return {
        "path": path,
        "entries": entries,
        "is_link": stat.S_ISLNK(mode),
        "size": info.st_size,
    }

def load_file_info(db, path):
    """Return the cached "files" record for `path` as a dict, or None.

    The stored "entries" text is decoded back into a list: NULL stays None,
    the empty string becomes an empty list, and anything else is split on
    NUL characters.
    """
    row = db.execute(
        'SELECT "id", "entries", "is_link", "size" FROM "files" '
        'WHERE "path" = ?', [path]).fetchone()
    if row is None:
        return None
    file_id, packed_entries, is_link, size = row
    if packed_entries is None:
        entries = None
    else:
        entries = [] if packed_entries == "" else packed_entries.split("\0")
    return {
        "id": file_id,
        "path": path,
        "entries": entries,
        "is_link": is_link,
        "size": size,
    }

def save_file_info(db, path, entries, is_link, size):
    """Insert a "files" row for `path` and return the record with its new id.

    A list of entries is stored as one NUL-joined string; None is stored as
    NULL.  The returned dict carries `entries` exactly as it was passed in.
    """
    packed_entries = None if entries is None else "\0".join(entries)
    cursor = db.cursor()
    cursor.execute('INSERT INTO "files" ("path", "entries", "is_link", "size") '
                   'VALUES (?, ?, ?, ?)',
                   [path, packed_entries, is_link, size])
    return {
        "path": path,
        "entries": entries,
        "is_link": is_link,
        "size": size,
        "id": cursor.lastrowid,
    }

def get_file_info(db, path):
    """Return the record for `path`, stat-ing and caching it on a cache miss."""
    cached = load_file_info(db, path)
    if cached is not None:
        return cached
    fresh = stat_file_info(path)
    return save_file_info(db, **fresh)

def load_dir_total_size(db, path):
    """Return the cached total size recorded for `path`, or None if absent."""
    row = db.execute(
        'SELECT "total_size" FROM "total_sizes" '
        'INNER JOIN "files" ON "file_id" = "files"."id" '
        'WHERE "path" = ?', [path]).fetchone()
    return None if row is None else row[0]

def save_dir_total_size(db, path, total_size):
    """Record `total_size` for `path`, keyed by the path's "files" id."""
    info = get_file_info(db, path)
    row = [info["id"], total_size]
    db.execute('INSERT INTO "total_sizes" ("file_id", "total_size") '
               'VALUES (?, ?)', row)

def get_dir_total_size(db, path, sizes):
    """Return the total size of directory `path`, caching it on first use.

    `sizes` is an iterable of the children's sizes; it is only summed when
    no total is cached yet.  Passing `sizes=None` always yields None.
    """
    cached = load_dir_total_size(db, path)
    if sizes is None:
        return None
    if cached is not None:
        return cached
    computed = sum(sizes)
    save_dir_total_size(db, path, computed)
    return computed

def get_dir_entries(db, path):
    """Return the (cached) child names of `path`, or None when it is a leaf."""
    info = get_file_info(db, path)
    return info["entries"]

def load_file_hash(db, algorithm, path):
    """Return the cached `algorithm` hash of `path`, or None if absent."""
    row = db.execute('SELECT "hash" FROM "hashes" INNER JOIN "files" '
                     'ON "file_id" = "files"."id" '
                     'WHERE "algorithm" = ? AND "path" = ?',
                     [algorithm, path]).fetchone()
    if row is None:
        return None
    return row[0]

def save_file_hash(db, algorithm, path, h):
    """Record hash `h` of `path` under `algorithm`, keyed by its "files" id."""
    info = get_file_info(db, path)
    row = [algorithm, info["id"], h]
    db.execute('INSERT INTO "hashes" '
               '("algorithm", "file_id", "hash") VALUES (?, ?, ?)',
               row)

def get_file_hash(db, algorithm, path):
    """Return the hash of `path`, computing and caching it on a cache miss."""
    cached = load_file_hash(db, algorithm, path)
    if cached is not None:
        return cached
    computed = hash_file(algorithm, path)
    save_file_hash(db, algorithm, path, computed)
    return computed

def remove_redundant_paths(paths_multidict, scratch_multidict):
    """Prune `paths_multidict` in place to paths not nested under another.

    Duplicates and paths lying below another stored path are dropped; the
    survivors are re-inserted in normalized form.  `scratch_multidict` is
    used as temporary storage and is empty on return.
    """
    scratch_multidict.clear()
    for entry in paths_multidict.items():
        # A trailing separator keeps "/a" from being taken as a prefix of "/ab".
        scratch_multidict.insert(os.path.join(entry[0], ""), None)
    paths_multidict.clear()
    kept_prefix = None
    # .items() is always sorted, so descendants directly follow their ancestor
    for entry in scratch_multidict.items():
        candidate = entry[0]
        if kept_prefix is not None and candidate.startswith(kept_prefix):
            continue
        kept_prefix = candidate
        paths_multidict.insert(os.path.normpath(candidate), None)
    scratch_multidict.clear()

def _scan_paths(paths_multidict, listdir, proc_file, proc_entries):
    # Traverse every root stored in paths_multidict.  A root whose traversal
    # raises OSError is logged and skipped so the other roots still get scanned.
    for path, _ in paths_multidict.items():
        try:
            traverse_dir(path, listdir, proc_file, proc_entries)
        except OSError as e:
            logging.warning(e)

def _report_duplicates(cache_db, paths_by_hash_multidict, size_threshold):
    # Write one commented-out "#rm" group to stdout for every hash shared by
    # several paths, skipping groups smaller than size_threshold.
    first_group = True
    for h in paths_by_hash_multidict.nonunique_keys():
        # All paths of a group share the hash, so the first one describes it.
        path0 = next(paths_by_hash_multidict[h])
        file_info = get_file_info(cache_db, path0)
        if file_info["entries"] is not None:
            total_size = load_dir_total_size(cache_db, path0)
            if total_size < size_threshold:
                continue
            extra = ("directory: {0} B, {1} entries"
                     .format(total_size, len(file_info["entries"])))
        else:
            if file_info["size"] < size_threshold:
                continue
            if file_info["is_link"]:
                extra = "link"
            else:
                extra = "file: {0} B".format(file_info["size"])
        # Separate groups by a blank line.  Tracking what was actually
        # printed avoids a leading blank line when the first groups are
        # skipped by the size threshold.
        if not first_group:
            sys.stdout.write("\n")
        first_group = False
        # h[1:] drops the one-character type prefix from the hash.
        sys.stdout.write("# {0} ({1})\n".format(h[1:], extra))
        for path in paths_by_hash_multidict[h]:
            sys.stdout.write("#rm {0}\n".format(shell_escape(path)))
        sys.stdout.flush()

def main_dedup(paths, cache_fn, hash_algorithm, scratch_fn, size_threshold):
    """Find duplicate files and directories under `paths` and print a report.

    paths: iterable of starting paths; each is resolved with os.path.realpath.
    cache_fn: SQLite database holding cached file info, sizes and hashes.
    hash_algorithm: algorithm name passed to hashlib.new.
    scratch_fn: SQLite database for working tables; unless it is ":memory:"
        the file is deleted before use and again when the function returns.
    size_threshold: groups whose size is below this many bytes are not
        reported.

    The work is done in two passes.  The rough pass keys every file by a
    hash of its size and every directory by a hash of its children's keys.
    Only paths whose key is shared with another path go on to the detailed
    pass, which keys them by content hash.  For every content hash shared by
    several paths a group is written to stdout: a "# <hash> (<details>)"
    line followed by one "#rm <path>" line per path.

    Both database connections are closed before returning.
    """
    # Instantiated only to reject an unsupported algorithm name up front,
    # before any database is opened or any file is scanned.
    hashlib.new(hash_algorithm)

    cache_db = sqlite3.connect(cache_fn)
    try:
        with cache_db:
            init_cache_db(cache_db)

        listdir = functools.partial(get_dir_entries, cache_db)

        remove_sqlite_db(scratch_fn)
        scratch_db = sqlite3.connect(scratch_fn)
        try:
            # The `with` block commits (or rolls back) both databases; it
            # does not close them, which is what the `finally` clauses do.
            with scratch_db, cache_db:
                paths_multidict = SqliteMultiDict(scratch_db, "paths")
                paths_scratch_multidict = SqliteMultiDict(scratch_db,
                                                          "paths_scratch")
                paths_by_size_multidict = SqliteMultiDict(scratch_db, "size")
                paths_by_hash_multidict = SqliteMultiDict(scratch_db, "hash")

                paths_multidict.clear()
                for path in map(os.path.realpath, paths):
                    paths_multidict.insert(path, None)

                remove_redundant_paths(paths_multidict,
                                       paths_scratch_multidict)

                logging.info("Performing rough scan based on size...")

                def size_key_of_file(path):
                    size = get_file_info(cache_db, path)["size"]
                    h = hash_str(hash_algorithm, str(size))
                    paths_by_size_multidict.insert(h, path)
                    return h, size
                def size_key_of_dir(path, hash_and_sizes):
                    size = get_dir_total_size(
                        cache_db, path, (size for _, size in hash_and_sizes))
                    h = hash_str(hash_algorithm,
                                 "".join(h for h, _ in hash_and_sizes))
                    paths_by_size_multidict.insert(h, path)
                    return h, size
                _scan_paths(paths_multidict, listdir,
                            size_key_of_file, size_key_of_dir)

                # Only paths whose size key is shared can be duplicates.
                paths_multidict.clear()
                for _, path in paths_by_size_multidict.nonunique_items():
                    paths_multidict.insert(path, None)

                remove_redundant_paths(paths_multidict,
                                       paths_scratch_multidict)

                logging.info("Performing detailed scan based on hashes...")

                def content_hash_of_file(path):
                    h = get_file_hash(cache_db, hash_algorithm, path)
                    paths_by_hash_multidict.insert(h, path)
                    return h
                def content_hash_of_dir(path, hashes):
                    h = hash_dir_entries(hash_algorithm, hashes)
                    paths_by_hash_multidict.insert(h, path)
                    return h
                _scan_paths(paths_multidict, listdir,
                            content_hash_of_file, content_hash_of_dir)

                # NOTE(review): this pruned set of duplicate roots is built
                # but the report below does not read it; kept as it was.
                paths_multidict.clear()
                for _, path in paths_by_hash_multidict.nonunique_items():
                    paths_multidict.insert(path, None)

                remove_redundant_paths(paths_multidict,
                                       paths_scratch_multidict)

                _report_duplicates(cache_db, paths_by_hash_multidict,
                                   size_threshold)
        finally:
            # Close before deleting so the file is not unlinked while open.
            scratch_db.close()
            remove_sqlite_db(scratch_fn)
    finally:
        cache_db.close()

def main():
    """Parse the command line, set up logging and run main_dedup."""
    logging.basicConfig(level=logging.INFO,
                        format="# [%(levelname)s] %(message)s")
    parser = argparse.ArgumentParser(
        description="Tool for finding duplicate files.")
    parser.add_argument("--hash-algorithm", default="sha1",
                        metavar="<algorithm>")
    parser.add_argument("--scratch-file", dest="scratch_fn",
                        default=":memory:", metavar="<file>")
    parser.add_argument("--size-threshold", "-s", type=int, default=0,
                        metavar="<bytes>",
                        help="Ignore files smaller than this size")
    parser.add_argument("--cache-file", "-f", dest="cache_fn",
                        default=":memory:", metavar="<file>")
    parser.add_argument(dest="paths", nargs="+", metavar="<path>")
    # The option destinations match main_dedup's parameter names exactly.
    options = vars(parser.parse_args())
    return main_dedup(**options)

if __name__ == "__main__":
    # sys.exit rather than the bare exit(): the latter is injected by the
    # site module for interactive use and is missing when site is not loaded.
    sys.exit(main())