#!/bin/bash

# findup - find duplicate files
# Copyright © 2000-2009 by Pádraig Brady.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details,
# which is available at www.gnu.org

# Description
#
#   will show duplicate files in the specified directories
#   (and their subdirectories), in the format:
#
#       file1
#       file2
#
#       file3
#       file4
#       file5
#
#   or if the --summary option is specified:
#
#       2 * 2048    file1 file2
#       3 * 1024    file3 file4 file5
#
#   Where the number is the disk usage in bytes of each of the
#   duplicate files on that line, and all duplicate files are
#   shown on the same line.
#   Output is ordered by largest disk usage first and
#   then by the number of duplicate files.
#
# Caveats/Notes:
#   I compared this to any equivalent utils I could find (as of Nov 2000)
#   and it's (by far) the fastest, has the most functionality (thanks to
#   find) and has no (known) bugs. In my opinion fdupes is the next best but
#   is slower (even though written in C), and has a bug where hard links
#   in different directories are reported as duplicates sometimes.
#
#   This script requires uniq > V2.0.21 (part of GNU textutils|coreutils)
#   dir/file names containing \n are ignored
#   undefined operation for dir/file names containing \1
#   sparse files are not treated differently.
#   Don't specify params to find that affect output etc. (e.g -printf etc.)
#   zero length files are ignored.
#   symbolic links are ignored.
#   path1 & path2 can be files &/or directories

script_dir=$(dirname "$0")              #directory of this script
script_dir=$(readlink -f "$script_dir") #Make sure absolute path

# NOTE(review): Version and shell_quote (used below) are not defined in this
# file; presumably this sourced helper provides them - confirm.
. "$script_dir"/supprt/fslver

# Print the help text on stdout and terminate the script.
Usage() {
    local ProgName
    ProgName=$(basename "$0")
    cat <<EOF
find dUPlicate files.
Usage: $ProgName [[[-t [-m|-d]] | [--summary]] [-r] [-f] paths(s) ...]

If no path(s) specified then the current directory is assumed.

When -m is specified any found duplicates will be merged (using hardlinks).
When -d is specified any found duplicates will be deleted (leaving just 1).
When -t is specfied, only report what -m or -d would do.
When --summary is specified change output format to include file sizes.
You can also pipe this summary format to $script_dir/fstool/dupwaste
to get a total of the wastage due to duplicates.

Examples:

search for duplicates in current directory and below
    findup or findup .
search for duplicates in all linux source directories and merge using hardlinks
    findup -m /usr/src/linux*
same as above but don't look in subdirectories
    findup -r .
search for duplicates in /usr/bin
    findup /usr/bin
search in multiple directories but not their subdirectories
    findup -r /usr/bin /bin /usr/sbin /sbin
search for duplicates in \$PATH
    findup \$($script_dir/supprt/getffp)
search system for duplicate files over 100K in size
    findup / -size +100k
search only my files (that I own and are in my home dir)
    findup ~ -user \$(id -u)
search system for duplicate files belonging to roger
    findup / -user \$(id -u roger)
EOF
    exit
}

# Filter: normalise "<checksum><sep><name>" lines (md5sum/sha1sum style) read
# from stdin so that the separator is always exactly two spaces.
cleanup_sum() {
    # 1st expression: md5sum and sha1sum et. al. (from coreutils at least),
    #   to deal with \n in filenames, prefix the line with \ and convert any
    #   \ and \n chars to \\ and \\n respectively. Files with \n are ignored
    #   by this script, so just undo this problematic escaping.
    # 2nd expression: these utils also add a "*" flag character for normal
    #   files on platforms where O_BINARY is significant (like CYGWIN).
    #   Replace that flag with a space so the two character separator is
    #   kept; the later `cut -d' ' -f3-` stages depend on it.
    sed -e '/^\\/{s/.//; s/\\\\/\\/g;}' \
        -e 's/^\([^ ]*\) \*/\1  /'
}

# Filter: turn blank-line separated groups of file names (stdin) into one
# "<count> * <du output of first file> <file2> <file3> ..." line per group.
summarise_groups() {
    local file line='' counter=0

    # Note we don't specify a variable name to `read` (so $REPLY is used)
    # as otherwise leading/trailing IFS characters would be stripped.
    # -r stops `read` from eating backslashes that are part of a file name.
    while read -r; do
        file=$REPLY
        if [ -z "$file" ]; then
            # A blank line terminates the current group.
            if [ -n "$line" ]; then
                printf '%s\n' "$counter * $line"
            fi
            # Reset so that a repeated or trailing separator
            # can not print the same group a second time.
            line=''
            counter=0
        elif [ "$counter" -eq 0 ]; then
            # First file of a group: du supplies "<bytes><TAB><name>".
            line=$(du -B1 -- "$file")
            counter=1
        else
            line="$line $file"
            counter=$((counter + 1))
        fi
    done

    # The last group is normally not followed by a separator line.
    if [ -n "$line" ]; then
        printf '%s\n' "$counter * $line"
    fi
}

# Pick out the options handled here; everything else is shell quoted
# and collected for the getfpf helper sourced below.
for arg; do
    case "$arg" in
    -h|--help|-help)
        Usage ;;
    -v|--version)
        Version ;;
    --summary)
        mode="summary" ;;
    -m)
        mode="merge" ;;
    -d)
        mode="del" ;;
    -t)
        t="t" ;;
    *)
        argsToPassOn="$argsToPassOn $(shell_quote "$arg")" ;;
    esac
done

sep_mode="separate"

if [ "$mode" = "summary" ]; then
    #Don't do extra hardlink processing.
    #This speeds things up, and also removes the python dependency
    merge_early="-u"
fi

# NOTE(review): $FPF, $LF, check_uniq and the positional parameters given to
# find below are not set anywhere in this file; presumably this sourced
# helper establishes them from $argsToPassOn - confirm. It must stay at the
# top level (not inside a function) so that any change it makes to "$@" is
# seen by the find command.
. "$script_dir"/supprt/getfpf "$argsToPassOn"

check_uniq

# Find out whether this version of find understands the %D (device) format.
dev_id="$(find /bin/sh -printf '%D' 2>/dev/null)"
if [ "$dev_id" = "D" ] || [ ! "$dev_id" ]; then
    devFmt="\060" #0
else
    devFmt=%D #This is new in findutils-4.2 and will help find more duplicates
fi

                      #print name, dev, inode & size.
find "$@" -size +0c -type f ! -name "*$LF*" -printf "$FPF\0$devFmt\0%i\0%s\n" |
sort -u |             #merge files (indirectly) specified multiple times
tr ' \t\0' '\0\1 ' |  #remove spaces, tabs in file names
# group [and merge] size, dev & inodes ($merge_early is either empty or -u)
sort -k4,4nr -k2,2n -k3,3 ${merge_early:+"$merge_early"} |
if [ -z "$merge_early" ]; then
    "$script_dir"/supprt/rmlint/merge_hardlinks
else
    uniq -3 -D        #pick just duplicate filesizes
fi |
sort -k2,2n -k3,3n |  #NB sort inodes so md5sum does less seeking all over disk
cut -f1 -d' ' -s |    #get filenames to work on
tr '\0\1\n' ' \t\0' | #reset any space & tabs etc and delimit names with \0

# The following optional block, md5sums a small sample of each file,
# which can help when there are many files of the same size,
# even more so if they are large. This usually adds a small amount of
# runtime, however it can save a large amount of time in certain situations.
# The probe is run with stdin from /dev/null and stderr discarded so that it
# can neither consume the file names in the pipe nor print an error when the
# helper is missing. NOTE(review): assumes the helper exits 0 and prints
# nothing when given no file names - confirm.
if "$script_dir"/supprt/md5sum_approx </dev/null 2>/dev/null; then
    xargs -r0 "$script_dir"/supprt/md5sum_approx |
    sort |                     #group duplicate files together
    uniq --all-repeated -w32 | #pick just duplicates
    cut -d' ' -f3- |           #get filenames
    sort |                     #sort by paths to try to minimise disk seeks
    tr '\n' '\0'               #delimit names with \0
else
    cat
fi |

# This block selects duplicates using md5sum of whole file
xargs -r0 md5sum -- |          #calculate md5sums for possible duplicates
cleanup_sum |                  #undo any backslash escaping
sort |                         #group duplicate files together
uniq --all-repeated="$sep_mode" -w32 | #pick just duplicates

# The following optional block, checks duplicates again using sha1
# Note for data sets that don't totally fit in cache this will
# probably read duplicate files off the disk again.
cut -s -d' ' -f3- |            #get filenames
sort |                         #sort by paths to try to minimise disk seeks
tr '\n' '\0' |                 #delimit names with \0
xargs -r0 sha1sum -- |         #to be sure to be sure
cleanup_sum |                  #undo any backslash escaping
sort |                         #group duplicate files together
uniq --all-repeated="$sep_mode" -w40 | #pick just duplicates

cut -d' ' -f3- |               #get filenames (and leave separating lines)

if [ "$mode" ]; then
    if [ "$mode" != "summary" ]; then
        # external call to python as this is faster
        # (probes use the same /dev/null redirections as md5sum_approx above)
        if "$script_dir"/supprt/rmlint/fixdup </dev/null 2>/dev/null; then
            "$script_dir"/supprt/rmlint/fixdup "$t$mode"
        elif "$script_dir"/supprt/rmlint/fixdup.sh </dev/null 2>/dev/null; then
            "$script_dir"/supprt/rmlint/fixdup.sh "$t$mode"
        else
            echo "Error, couldn't execute merge util" >&2
            exit 1
        fi
    else
        summarise_groups |
        sort -k3,3 -k1,1 -brn
    fi
else
    cat
fi