#! /usr/bin/perl
# vim: set filetype=perl:

# repeats_clone.pl: Searches for duplicate files in the specified directories
# (just like repeats)

# Copyright (C) 2020 by Brian Lindholm.  This file is part of the littleutils
# utility set.
#
# The repeats utility is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later version.
#
# The repeats utility is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# the littleutils.  If not, see <https://www.gnu.org/licenses/>.

### MODULES ###
use strict;
use warnings;
use Crypt::Digest::MD5 qw(md5);  # all of these require the CryptX module/package
use Crypt::Digest::SHA1 qw(sha1);
use Crypt::Digest::SHA224 qw(sha224);
use Crypt::Digest::SHA256 qw(sha256);
use Crypt::Digest::SHA384 qw(sha384);
use Crypt::Digest::SHA512 qw(sha512);
use Crypt::Digest::BLAKE2b_256 qw(blake2b_256);
use Crypt::Digest::BLAKE2b_512 qw(blake2b_512);
use File::Compare;
use File::Find;
use Getopt::Std;

### LIST FUNCTIONS ###
# Return the arguments with repeated values dropped: the first occurrence of
# each value is kept and the original order is preserved (same idea as "uniq"
# of List::Util, List::MoreUtils or List::SomeUtils).  In scalar context the
# number of distinct values is returned.
sub uniq {
  my %already_seen;
  my @distinct;
  foreach my $item (@_) {
    next if ($already_seen{$item}++);  # second or later sighting: skip it
    push(@distinct, $item);
  }
  return @distinct;
}
# Return only those arguments whose value occurs exactly once in the whole
# argument list, in their original order (like "singleton" of List::MoreUtils
# or List::SomeUtils).  In scalar context the number of such values is returned.
sub singleton {
  my %count;
  $count{$_}++ foreach (@_);  # first pass: tally every value
  # second pass: a value tallied once appears once, so order is preserved
  my @once = grep { $count{$_} == 1 } @_;
  return @once;
}
# Return each value that occurs more than once in the argument list.  Every
# such value is reported a single time, at the position of its second
# occurrence (like "duplicates" of List::MoreUtils).  In scalar context the
# number of repeated values is returned.
sub duplicates {
  my %count;
  my @repeated;
  foreach my $item (@_) {
    # report the value the moment its tally reaches two, and never again
    push(@repeated, $item) if (++$count{$item} == 2);
  }
  return @repeated;
}
# Debugging aid: write every argument on a line of its own to the currently
# selected output handle, then finish with one blank line.
sub print_list {
  print $_, "\n" for (@_);
  print "\n";
}

### DIGEST FUNCTIONS ###
# complete file hash
#
# Compute the digest of the entire contents of a file.
#
# Arguments:
#   $filename  - path of the file to hash (opened in raw mode)
#   $algorithm - numeric selector: 1 = MD5, 2 = SHA1, 3 = SHA224, 4 = SHA256,
#                5 = SHA384, 6 = SHA512, 7 = BLAKE2B-256, 8 = BLAKE2B-512
# Returns whatever the digest object's "digest" method yields for the file.
# Dies if the file cannot be opened or if the selector is not one of 1..8.
sub complete_filehash {
  my $filename = shift;
  my $algorithm = shift;
  # Each selector maps to one of the Crypt::Digest::* classes loaded at the top
  # of this file.  There is no "Crypt::Digest::SHA" class taking a bit length;
  # every SHA variant has a class of its own.
  my %digest_class = (
    1 => 'Crypt::Digest::MD5',
    2 => 'Crypt::Digest::SHA1',
    3 => 'Crypt::Digest::SHA224',
    4 => 'Crypt::Digest::SHA256',
    5 => 'Crypt::Digest::SHA384',
    6 => 'Crypt::Digest::SHA512',
    7 => 'Crypt::Digest::BLAKE2b_256',
    8 => 'Crypt::Digest::BLAKE2b_512',
  );
  open(my $FILE, "<:raw", $filename) or die "repeats error: unable to open $filename\n";
  # Adding zero keeps the selector comparison numeric, so that e.g. "4.0"
  # still selects entry 4 just as a numeric == test would.
  my $class = $digest_class{$algorithm + 0};
  die "repeats error: unsupported algorithm selected\n" unless (defined($class));
  my $hasher = $class->new;
  $hasher->addfile($FILE);
  close($FILE);
  return $hasher->digest;
}
# partial file hash
#
# Compute the digest of only the leading part of a file.
#
# Arguments:
#   $filename  - path of the file to hash (opened in raw mode)
#   $algorithm - numeric selector: 1 = MD5, 2 = SHA1, 3 = SHA224, 4 = SHA256,
#                5 = SHA384, 6 = SHA512, 7 = BLAKE2B-256, 8 = BLAKE2B-512
#   $bytes     - maximum number of bytes to read from the start of the file
# Returns the result of the selected one-shot digest function applied to the
# data that was read.
# Dies if the file cannot be opened or read, or if the selector is not 1..8.
sub partial_filehash {
  my $filename = shift;
  my $algorithm = shift;
  my $bytes = shift;
  open(my $FILE, "<:raw", $filename) or die "repeats error: unable to open $filename\n";
  my $data;
  my $count = read($FILE, $data, $bytes);
  defined($count) or die "repeats error: unable to read data from $filename\n";
  close($FILE);
  # selector number paired with the digest function imported for it
  my @hashers = (
    [1, \&md5],
    [2, \&sha1],
    [3, \&sha224],
    [4, \&sha256],
    [5, \&sha384],
    [6, \&sha512],
    [7, \&blake2b_256],
    [8, \&blake2b_512],
  );
  foreach my $choice (@hashers) {
    my ($selector, $hasher) = @{$choice};
    return $hasher->($data) if ($algorithm == $selector);
  }
  die "repeats error: unsupported algorithm selected\n";
}

### MAIN PROGRAM ###
# get input arguments
# The defaults below are assigned before getopts runs, so they remain in
# effect for every option that is not given on the command line.  In
# particular the default hash algorithm is 8 (BLAKE2B-512) and the default
# partial-hash size is 65536 bytes.
our $opt_a = 8; our $opt_d = ''; our $opt_h = ''; our $opt_l = '';
our $opt_m = 65536; our $opt_p = ''; our $opt_v = ''; our $opt_z = '';
my $goodopt = getopts('a:dhlm:pvz');

# print help if requested or if bad options used, then quit
# (the "(default)" marker must name the algorithm that $opt_a starts out as)
if ((not $goodopt) or $opt_h) {
  print "repeats LU_VERSION:\n";
  print "usage: repeats [-a hash_algorithm] [-d(ebug)] [-h(elp)] [-l(inks_hard)]\n";
  print "         [-m(idsize) bytecount] [-p(aranoid)] [-v(erbose)] [-z(eros)]\n";
  print "         [directory ...]\n";
  print "algorithms:  1 = MD5, 2 = SHA1, 3 = SHA224, 4 = SHA256, 5 = SHA384,\n";
  print "             6 = SHA512, 7 = BLAKE2B-256, 8 = BLAKE2B-512 (default)\n";
  exit(0);
}

# define subroutines for find and inode search
# Shared state filled in by wanted_file while the directories are walked:
my @candidate_list = ();  # name of every regular file found, in visit order
my %filesize = ();        # file name => size in bytes
my %filenode = ();        # file name => "device:inode" identity of the file

# File::Find callback.  Records the name, size and identity of the current
# entry when it is a regular file and not a symbolic link.
sub wanted_file {
  # -l performs an lstat on $_; the "_" handle then reuses that same result
  if ( ! -l $_ && -f _ ) {
    my ($device, $inode, $size) = (stat _)[0, 1, 7];
    # An inode number is unique only within one filesystem, so the device
    # number is part of the identity.  Otherwise unrelated files on different
    # devices could be mistaken for hard links of each other.
    $filenode{$File::Find::name} = $device . ':' . $inode;
    $filesize{$File::Find::name} = $size;
    push(@candidate_list, $File::Find::name);
  }
}

# Return the given file names with every later name that shares an identity
# (device and inode) with an earlier one removed, i.e. keep one name per
# hard-linked file.  Order is preserved.
sub uniq_inode {
  my %seen = ();
  return grep { ! $seen{$filenode{$_}}++ } @_;
}

# traverse listed paths
# Without path arguments the current directory is searched.  Otherwise only
# those arguments that are existing directories are searched.
my @search_paths;
if ($#ARGV > -1) {
  @search_paths = grep { -d $_ } @ARGV;
}
else {
  @search_paths = ('.');
}
File::Find::find({wanted => \&wanted_file}, @search_paths);
# a file reached through more than one search path must be listed only once
@candidate_list = uniq(@candidate_list);
DEBUG1: if ($opt_d) {
  print "### initial list ###\n";
  print_list(@candidate_list);
}

### STAGE 1 ###
if (not $opt_z) {
  # remove zero-size files
  @candidate_list = grep { $filesize{$_} > 0 } @candidate_list;
  DEBUG2: if ($opt_d) {
    print "### files with non-zero sizes ###\n";
    print_list(@candidate_list);
  }
}
# tally how many candidates have each file size; a file whose size is not
# shared with any other candidate cannot have a duplicate and is eliminated
my %size_count = ();
$size_count{$filesize{$_}}++ foreach (@candidate_list);
my %possible_dup = map { $_ => 1 } grep { $size_count{$filesize{$_}} > 1 } @candidate_list;
@candidate_list = grep { $possible_dup{$_} } @candidate_list;
DEBUG3: if ($opt_d) {
  print "### files with non-unique sizes ###\n";
  print_list(@candidate_list);
}
if ($opt_v) {
  print STDERR sprintf("repeats message: num files with non-unique filesize = %d\n", scalar(@candidate_list));
}

if (not $opt_l) {
  ### Optional STAGE 2 ###
  # keep only the first name of each set of hard links (names that refer to
  # the same file as an earlier name are dropped)
  @candidate_list = uniq_inode(@candidate_list);
  DEBUG4: if ($opt_d) {
    print "### files with unique inode numbers ###\n";
    print_list(@candidate_list);
  }
  if ($opt_v) {
    print STDERR sprintf("repeats message: num files excluding hardlinks = %d\n", scalar(@candidate_list));
  }
}

### STAGE 3 ###
# compute partial file hashes using digest functions; the file size is made
# part of the key so that equal digests of different-sized files never match
my %filehash = ();
foreach my $file (@candidate_list) {
  $filehash{$file} = $filesize{$file} . "\t" . partial_filehash($file, $opt_a, $opt_m);
}
# remove files that have a unique partial filehash
my @filehash_array = @filehash{@candidate_list};
my %partial_count = ();
$partial_count{$_}++ foreach (@filehash_array);
@candidate_list = grep { $partial_count{$filehash{$_}} > 1 } @candidate_list;
DEBUG5: if ($opt_d) {
  print "### files with non-unique partial hashes ###\n";
  print_list(@candidate_list);
}
if ($opt_v) {
  print STDERR sprintf("repeats message: num files with non-unique partial hash = %d\n", scalar(@candidate_list));
}

### STAGE 4 ###
# compute complete file hashes using digest functions; files no larger than
# the partial-hash size were already hashed in full during stage 3
foreach my $file (@candidate_list) {
  next unless ($filesize{$file} > $opt_m);
  $filehash{$file} = $filesize{$file} . "\t" . complete_filehash($file, $opt_a);
}
# remove files that have a unique complete filehash
# (@filehash_array keeps the hashes of all files that entered this stage; the
# grouping code further down reads it)
@filehash_array = @filehash{@candidate_list};
my %complete_count = ();
$complete_count{$_}++ foreach (@filehash_array);
@candidate_list = grep { $complete_count{$filehash{$_}} > 1 } @candidate_list;
DEBUG6: if ($opt_d) {
  print "### files with non-unique complete hashes ###\n";
  print_list(@candidate_list);
}
if ($opt_v) {
  print STDERR sprintf("repeats message: num files with non-unique complete hash = %d\n", scalar(@candidate_list));
}

### GROUPING ###
# gather the names of the remaining files under the hash they share
my @dup_filehash_array = duplicates(@filehash_array);
my %files_via_filehash = ();
foreach my $hash (@dup_filehash_array) {
  $files_via_filehash{$hash} = [];
}
# when no paths were given the search started at ".", so the leading "./"
# that File::Find puts on every name is stripped for display
my $explicit_paths = ($#ARGV > -1);
foreach my $file (sort(@candidate_list)) {
  my $shown = $explicit_paths ? $file : ($file =~ s/^\.\///r);
  push(@{$files_via_filehash{$filehash{$file}}}, $shown);  # appends, so nothing is clobbered
}
# pair every file with its successor inside the same group
my @results = ();
foreach my $hash (@dup_filehash_array) {
  my @group = @{$files_via_filehash{$hash}};
  foreach my $next (1 .. $#group) {
    my ($first, $second) = @group[$next - 1, $next];
    ### Optional STAGE 5 ###
    # paranoid mode: compare() yields 0 only for identical contents, so any
    # other outcome (different, or an error) drops the pair
    next if ($opt_p and compare($first, $second));
    push(@results, $first . "\t" . $second);
  }
}

### FINAL RESULTS ###
print $_, "\n" foreach (sort(@results));
