Search notes:

Perl script: find-equal-files.pl

find-equal-files.pl is a script that computes the MD5 hash of every file found under the given directories and then reports files with identical content, even if they are named differently.
#!/usr/bin/perl
#
#   Find equal files based on MD5 Hash
#
use warnings;
use strict;

use Digest::MD5;
use File::Find;
use File::Temp;
use Getopt::Long;

#   Command that is printed in front of every file that is considered
#   a redundant copy.
my $rm_del_command = 'rm';

#   Command line options:
#     --prio-file=FILE   (required) file with lines of the form <prio>:<regexp>
#     --suffixes-seen    additionally report every file suffix encountered
my $prio_file ='';
my $report_suffixes_seen = 0;
#   GetOptions returns false when the command line could not be parsed
#   (unknown option, missing value); stop instead of running with a
#   half-understood command line.
GetOptions('prio-file=s'   => \$prio_file, # ./find-equal-files.test/prio.file
           'suffixes-seen' => \$report_suffixes_seen)
  or die "Usage: $0 --prio-file=FILE [--suffixes-seen] DIRECTORY...\n";
use Cwd;

#   -f '' is always false, so report a missing option separately from a
#   missing file to keep the error message meaningful.
die "Missing required option --prio-file=FILE\n" if $prio_file eq '';
die "Priority file $prio_file does not exist" unless -f $prio_file;

#   List of {prio => ..., re => ...} rules, filled by read_prios() in the
#   order in which they appear in the priority file.
my @prios;
read_prios();


#   9.3.2013: Only used temporarily, in order to find
#   out what file suffixes are present
use File::Basename;
my  %suffixes_seen;

#   ---

#   MD5 hex digest => reference to the list of paths with that digest.
my  %md5_hex_hashes_seen;
#   Paths that were already processed (used by wanted() to skip repeats).
my  %files_seen;

#   Every remaining command line argument is a directory to search.
my  @directories_to_search = @ARGV;

#   Temporary file that receives one "<md5>  <path>" line per hashed file.
#   NOTE(review): File::Temp->new() objects remove their file by default
#   when they are destroyed, so this listing presumably does not survive
#   the end of the script -- confirm that this is intended.
my  $equal_files_seen = File::Temp->new();
print "equal_files_seen: $equal_files_seen\n";

open my $seen_files, '>', $equal_files_seen
  or die "Cannot open $equal_files_seen for writing: $!";
find (\&wanted, @directories_to_search);
#   Buffered write errors only surface when the handle is closed.
close $seen_files
  or die "Cannot close $equal_files_seen: $!";

#   Optional diagnostic (--suffixes-seen): one line for every distinct
#   file suffix that was recorded in %suffixes_seen.
if ($report_suffixes_seen) {
  print "Suffix seen: $_\n" for keys %suffixes_seen;
}

#   Two empty lines between the output so far and the duplicate report.
print "\n\n";

#   Duplicate report: for every digest shared by two or more files, print
#   the file with the best (lowest) path priority indented on its own line,
#   followed by one "$rm_del_command '<file>'" line per remaining file.
#   Forward slashes in the printed paths are turned into backslashes.
for my $digest (keys %md5_hex_hashes_seen) { # {  print duplicate files:

    my $paths = $md5_hex_hashes_seen{$digest};

    #   A digest that was seen only once has no duplicate.
    next unless @$paths > 1;

    #   Sort on the original paths first, convert the slashes afterwards.
    my ($keeper, @redundant) =
        map  { (my $shown = $_) =~ s,\/,\\,g; $shown }
        sort sort_path_priority @$paths;

    #   The kept file is indented so that it lines up with the quoted
    #   paths of the command lines below it.
    print "\n" . (' ' x (length($rm_del_command)+2)) . "$keeper\n";

    print "$rm_del_command '$_'\n" for @redundant;

} # }

sub sort_path_priority { # {
  #
  #   Comparison routine for sort(): orders the two paths in $a and $b
  #   numerically by the value path_priority() returns for each of them.
  #
  my $prio_of_a = path_priority($a);
  my $prio_of_b = path_priority($b);

  return $prio_of_a <=> $prio_of_b;
} # }

sub path_priority { # {
  #
  #   Returns the priority of the given path: the prio of the first rule
  #   in @prios whose regular expression matches the path, or 99 if no
  #   rule matches.
  #
  my ($path) = @_;

  foreach my $rule (@prios) {
    if ($path =~ /$rule->{re}/) {
      return $rule->{prio};
    }
  }

  #   Fallback for paths that are not covered by any rule.
  return 99;

} # }

sub wanted { # {
    #
    #   Callback for File::Find::find(). For every non-empty regular file
    #   that was not handled before, records its suffix, its MD5 digest
    #   and writes a "<md5>  <path>" line to $seen_files.
    #
    #   NOTE(review): the file tests and get_md5_hex() use $_ while the
    #   bookkeeping uses $File::Find::name; this relies on File::Find
    #   chdir()ing into the directory being visited -- confirm.
    #
    my $full_path = $File::Find::name;

    #   Only regular files are of interest.
    return unless -f $_;

    #   11.12.2014: Skip empty files
    if (! -s $_) {
      print "Warning!!!, $_ is empty\n";
      return;
    }

    #   The suffix pattern requires at least one character after the dot.
    my ($base, $dir, $ext) = fileparse ($full_path, qr"\..[^.]*$");
    my $seen_key = "$dir$base$ext";

    #   Process every path only once.
    return if exists $files_seen{$seen_key};
    $files_seen{$seen_key} = 1;

    $suffixes_seen{$ext} = 1;

    my $digest = get_md5_hex($_);

    push @{$md5_hex_hashes_seen{$digest}}, $full_path;

    print  $seen_files "$digest  $seen_key\n";

} # }

sub get_md5_hex { # {
    #
    #   Returns the MD5 digest of the content of the file $file_name as
    #   a hexadecimal string. The file is read in binary mode.
    #
    #   Dies with the file name and the operating system's error message
    #   if the file cannot be opened.
    #
    my $file_name = shift;

    open (my $fh, '<', $file_name)
      or die "Cannot open '$file_name' for reading: $!";
    binmode $fh;

    #   addfile() returns the digest object itself and croaks on its own
    #   if reading fails, so its return value needs no extra check.
    my $md5 = Digest::MD5->new;
    $md5->addfile($fh);

    my $md5_hex = $md5->hexdigest;

    close $fh;

    return $md5_hex;
} # }

sub read_prios { # {
  #
  #   Reads the priority rules from the file $prio_file into @prios.
  #
  #   A rule line consists of optional leading spaces, a number, a colon
  #   and a regular expression:   <prio>:<regexp>
  #   Lines that do not have this form are ignored. The rules are stored
  #   in file order as {prio => <prio>, re => <regexp>}.
  #
  #   Dies with the file name and the operating system's error message
  #   if the file cannot be opened.
  #
  open my $f, '<', $prio_file
    or die "Cannot open priority file '$prio_file': $!";

  while (my $line = <$f>) {
    #   Remove the line ending including the CR of a CRLF ending;
    #   otherwise the CR would become part of the regular expression
    #   and the rule would never match a path.
    $line =~ s/\r?\n?\z//;

    if ($line =~ /^ *(\d+):(.+)/) {
       push @prios, {prio=>$1, re=>$2};
    }
  }

  close $f;

} # }
GitHub repository scripts-and-utilities, path: /find-equal-files.pl

See also

Scripts
Perl module Digest::MD5

Index