Thin-dup - Remove Duplicate Files
Join the DZone community and get the full member experience.
Join For Free
This script compares the MD5 checksums of files and prompts to remove copies of files that are duplicated.
#!/usr/bin/perl -w
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#
# Program: thin-dup
# Version: 4.2
# Purpose: To find and confirm removal of duplicate copies of files
# in the current directory.
#
# Author: John Harrison
# Revision: 20 June 2003 4.0
# 19 July 2003 4.1
# 22 Nov 2004 4.2 Add die messages, verbose flag & tick.
#
# This is the fourth major re-write.
# The previous versions were all shell scripts.
# This one goes like lightning compared to them!
# It uses MD5 digests (via Digest::MD5) to compare any files it finds
# which are the same size and which have different inode numbers.
#
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
use strict;
use Getopt::Long;
use Digest::MD5 qw(md5_hex);
use Time::localtime;
use POSIX qw(:termios_h);
# Capture the terminal attributes of STDIN once at start-up.  $oterm holds
# the original local-mode flags; $noecho is the same flag word with the
# ECHO, ECHOK and ICANON bits cleared.  cbreak() and cooked() switch
# between the two.
my $fd_stdin = fileno STDIN;
my $term     = POSIX::Termios->new;
$term->getattr($fd_stdin);
my $oterm  = $term->getlflag;
my $echo   = ECHO | ECHOK | ICANON;    # bits to strip for single-key input
my $noecho = $oterm & ~$echo;
# Put STDIN's terminal into the "no echo, non-canonical" local mode held in
# $noecho, so that a single keypress can be read without waiting for Enter.
# Takes no arguments; the settings are applied immediately (TCSANOW).
sub cbreak {
    $term->setlflag($noecho);    # ok, so i don't want echo either
    $term->setcc(VTIME, 1);
    $term->setattr($fd_stdin, TCSANOW);
}
# Restore STDIN's terminal to the local-mode flags saved in $oterm at
# start-up and reset the VTIME control character to 0.  Takes no arguments;
# the settings are applied immediately (TCSANOW).
sub cooked {
    $term->setlflag($oterm);
    $term->setcc(VTIME, 0);
    $term->setattr($fd_stdin, TCSANOW);
}
# Read one byte from STDIN with the terminal temporarily switched by
# cbreak(), then switch it back with cooked().
# Returns the byte read, or the empty string if sysread() stored nothing.
sub readkey {
    my $char = '';
    cbreak();
    sysread STDIN, $char, 1;
    cooked();
    return $char;
}
# Whatever way the script exits normally (including via die), put the
# terminal flags back the way they were found.
END {
    cooked();
}
# Script-wide state, filled in by the passes further down:
#   @files       candidate file names
#   @digests     distinct digests, in the order they were first seen
#   %sizes       size => number of files of that size
#   %inodes      size => { n => inode of the n-th file of that size }
#   %files       size => { n => name  of the n-th file of that size }
#   %info        name => preformatted "size date" listing text
#   %digests     inode => digest
#   %copies      digest => number of files with that digest
#   %duplicates  digest => { n => name of the n-th file with that digest }
my (@files, @digests);
my (%sizes, %inodes, %files, %info, %digests, %copies, %duplicates);
#
# option variables with default value (false)
#
my $rm_f = 0;    # -f or -y: remove duplicates without prompting
my $rm_i = 0;    # -i: accepted, but nothing in the visible script reads it
my $verb = 0;    # -v: report each file on STDERR as it is summed
my $tick = 0;    # -t: show a spinner while summing
GetOptions(
    'f+' => \$rm_f,
    'y+' => \$rm_f,
    'i+' => \$rm_i,
    't+' => \$tick,
    'v+' => \$verb,
) or die "Usage: $0 -[f|y|i|t|v]\n";    # the usage text previously omitted -t
# If no file list has been supplied yet, take every plain file in the
# current directory, sorted by name.  A lexical directory handle is used and
# closed explicitly so that no global handle is left open for the rest of
# the run.
# NOTE(review): the /./ filter accepts any name containing a non-newline
# character, so it does not exclude dot-files -- confirm the intent.
unless (@files) {
    opendir(my $dh, ".") || die "Can't open directory: $!\n";
    @files = sort grep { -f $_ } grep { /./ } readdir($dh);
    closedir($dh);
}
# Month abbreviations, indexed from 0 (Jan) to 11 (Dec).
my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
# Frames of the progress spinner, shown in order.
my @tick = ('|', '/', '-', "\\");
# Unbuffered output on the currently selected handle, so prompts and
# spinner frames appear immediately.
$| = 1;
# Pass 1: group the candidate files by size.
# For every non-symlink in @files this records, under the file's size, its
# inode number and name (numbered 1..n in the order seen), and stores a
# preformatted "size  Mon dd hh:mm:ss yyyy" string in %info for the listing
# printed before the removal prompts.
for my $file (@files) {
    next if -l $file;    # skip symlinks

    # Only inode (1), size (7) and mtime (9) of the stat list are needed.
    my ($inode, $size, $mtime) = (stat($file))[1, 7, 9];

    # The file may have vanished or become unreadable since the directory
    # was scanned; skip it rather than file it under an undefined size.
    unless (defined $size) {
        warn "Can't stat $file: $!\n";
        next;
    }

    my $time = localtime($mtime);
    my $slot = ++$sizes{$size};    # counting the files of this size
    $inodes{$size}{$slot} = $inode;
    $files{$size}{$slot}  = $file;
    $info{$file} = sprintf "%13i %3s %-2i %02i:%02i:%02i %4i",
        $size, $months[$time->mon], $time->mday,
        $time->hour, $time->min, $time->sec,
        1900 + $time->year;
}
# Pass 2: compute an MD5 digest for every file that shares its size with at
# least one other file, largest sizes first.
# Fills %digests (inode => digest), %copies (digest => number of files),
# @digests (distinct digests in first-seen order) and %duplicates
# (digest => { n => file name }).
# Dies if a file cannot be opened for reading.
my $count = 0;    # index of the next spinner frame
for my $size (sort { $b <=> $a } keys %sizes) {
    next unless $sizes{$size} > 1;    # a unique size cannot have a duplicate

    for my $i (1 .. $sizes{$size}) {
        my ($inode, $file) = ($inodes{$size}{$i}, $files{$size}{$i});

        # Don't bother to sum the same inode more than once
        next if $digests{$inode};

        print STDERR "Summing: $file", $/ if $verb;
        if ($tick) {
            print "\r", $tick[$count];
            $count = ($count + 1) % @tick;
        }

        # Hash the file's actual contents.  Calling md5_hex() with no
        # arguments digests the empty string, which made every file of the
        # same size look like a duplicate.
        open(my $fh, '<', $file) or die "Can't read $file: $!\n";
        binmode($fh);
        my $digest = Digest::MD5->new->addfile($fh)->hexdigest;
        close($fh);

        $digests{$inode} = $digest;
        push(@digests, $digest) unless $copies{$digest}++;
        $duplicates{$digest}{ $copies{$digest} } = $file;
    }
}
# Pass 3: for each digest shared by more than one file, list the files and
# offer to remove them one at a time.
# With $rm_f set every offer is answered "y" automatically; otherwise a
# single key is read per file ("y"/"Y" removes, Ctrl-C aborts the script,
# anything else keeps the file).  Offers for a group stop once successful
# removals have left only one file of that group.
for my $digest (@digests) {
    next unless $copies{$digest} > 1;

    my @duplicates = map { $duplicates{$digest}{$_} } 1 .. $copies{$digest};
    print "$info{$_} $_\n" for @duplicates;

    my $remaining = @duplicates;    # copies of this content still on disk
    for my $file (@duplicates) {
        my $key;
        if ($rm_f) {
            $key = "y";
        }
        else {
            $| = 1;
            print "Remove '$file'? [y|N] ";
            $key = readkey();
            $key = '' unless defined $key;
            chomp($key);
        }

        if ($key =~ /^y$/i) {
            # Only report and count a removal that actually happened; a
            # failed unlink leaves the copy in place.
            if (unlink($file)) {
                print "removed `$file`\n";
                last if --$remaining < 2;    # always keep one copy
            }
            else {
                warn "Can't remove $file: $!\n";
            }
        }
        elsif ($key =~ /^\003$/i) {    # Ctrl-C read as an ordinary byte
            die "\n";
        }
        else {
            print "\n";
        }
    }
}
Opinions expressed by DZone contributors are their own.
Comments