#! /usr/bin/perl
# vim: set filetype=perl:

# repeats.pl: Searches for duplicate files in the specified directories (just
# like repeats)

# Copyright (C) 2020 by Brian Lindholm.  This file is part of the littleutils
# utility set.
#
# The repeats utility is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later version.
#
# The repeats utility is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# the littleutils.  If not, see <https://www.gnu.org/licenses/>.

### MODULES ###
use strict;
use warnings;
use Getopt::Std;
use Crypt::Digest::MD5;  # all of these require the CryptX module/package
use Crypt::Digest::SHA1;
use Crypt::Digest::SHA224;
use Crypt::Digest::SHA256;
use Crypt::Digest::SHA384;
use Crypt::Digest::SHA512;
use Crypt::Digest::BLAKE2b_256;
use Crypt::Digest::BLAKE2b_512;

### INPUT ARGUMENTS ###
# Option defaults.  getopts() overwrites these package variables ($opt_X) for
# every switch it finds on the command line.
our $opt_1 = ''; our $opt_a = 8; our $opt_h = ''; our $opt_l = '';
our $opt_m = 4096; our $opt_r = 4; our $opt_v = ''; our $opt_z = '';

# Return the complete usage message as a single string.  The algorithm tagged
# "(default)" must agree with the initial value given to $opt_a above.
sub usage_text {
  return join('',
    "repeats LU_VERSION:\n",
    "usage: repeats [-1(line)] [-a hash_algorithm] [-h(elp)] [-l(inks_hard)]\n",
    "         [-m(idsize) bytecount] [-r ramp_rate] [-v(erbose)] [-z(eros)]\n",
    "         [directory ...]\n",
    "algorithms:  1 = MD5, 2 = SHA1, 3 = SHA224, 4 = SHA256, 5 = SHA384,\n",
    "             6 = SHA512, 7 = BLAKE2B-256, 8 = BLAKE2B-512 (default)\n");
}

my $goodopt = getopts('1a:hlm:r:vz');
# print help if requested or if bad options used, then quit
if ((not $goodopt) or $opt_h) {
  print usage_text();
  exit(0);
}
# values below 1 make no sense for either setting; fall back to the defaults
$opt_m = 4096 if ($opt_m < 1);
$opt_r = 4 if ($opt_r < 1);

### GLOBAL VARIABLES ###
# largest single read() request issued while hashing (1 MiB)
my $BUFSIZE = 1 << 20;
# running total of file names handed to push_to_results
my $match_count = 0;
# per-file data, keyed by file name: device number, inode number and size
# (filled by find_files) and the saved digest state (filled by grab_digest)
my %filedev = ();
my %filenode = ();
my %filesize = ();
my %filehash = ();
# counters: digest string -> signed tally (see grab_digest), size -> file count
my %digest_seen = ();
my %size_seen = ();
# output lines, printed in sorted order at the very end
my @results = ();

### LIST FUNCTIONS ###
# Return the arguments in their original order, keeping only the first
# occurrence of each value (compared as strings).
sub uniq {
  my %already = ();
  my @kept = ();
  foreach my $item (@_) {
    next if ($already{$item}++);
    push(@kept, $item);
  }
  return @kept;
}
# list with subsequent hardlinks (matching inode number & dev number) removed
#   @_      : file names that have entries in %filenode and %filedev
#   returns : the names in their original order, keeping only the first name
#             seen for each (device, inode) pair
sub uniq_inode {
  my %seen = ();
  # The ':' keeps the two numbers apart.  Gluing them together directly would
  # make (inode 12, dev 3) and (inode 1, dev 23) share the key "123", and a
  # file that is not a hardlink at all would be thrown away.
  return grep { ! $seen{$filedev{$_} . ':' . $filenode{$_}}++ } @_;
}
# Append one group of matching files to @results and add the group's size to
# $match_count.
#   @_ : the file names in the group
# With -1 the whole group becomes a single tab-separated line; otherwise each
# file is paired with the one that follows it, one tab-separated pair per line.
sub push_to_results {
  my $group_size = scalar(@_);
  if (not $opt_1) {
    foreach my $idx (1 .. ($group_size - 1)) {
      push(@results, join("\t", $_[$idx - 1], $_[$idx]));
    }
  }
  else {
    push(@results, join("\t", @_));
  }
  $match_count += $group_size;
}

### FIND FUNCTION ###
# find files (excluding symlinks) and fetch stats
#   @_ : directories to scan; every subdirectory found is scanned recursively
# For each regular file, the device number, inode number and size reported by
# lstat are stored in %filedev, %filenode and %filesize (keyed by path), and
# the counter for that size in %size_seen is incremented.
# Dies if a directory cannot be opened.
sub find_files {
  foreach my $path (@_) {
    # strip trailing slashes on a copy, so the caller's list (aliased through
    # @_) is left untouched
    (my $dir = $path) =~ s/\/+\z//;
    # a path made only of slashes is the root directory, not the empty string
    my $target = (($dir eq '') and ($path ne '')) ? '/' : $dir;
    opendir(my $DIR, $target) or die "repeats error: cannot open $target\n";
    my @subdirs = ();
    while (defined(my $entry = readdir($DIR))) {
      next if (($entry eq '.') || ($entry eq '..'));
      # when no paths were given on the command line, entries directly under
      # '.' are recorded without a './' prefix
      my $fullname = (($#ARGV == -1) && ($dir eq '.')) ? $entry : $dir . '/' . $entry;
      # lstat rather than stat: a symlink is then neither -f nor -d below
      my ($dev, $ino, $size) = (lstat($fullname))[0, 1, 7];
      if ( -f _ ) {
        $filedev{$fullname} = $dev;
        $filenode{$fullname} = $ino;
        $filesize{$fullname} = $size;
        $size_seen{$size}++;
      }
      elsif ( -d _ ) {
        push(@subdirs, $fullname);
      }
    }
    # closedir (not close) is what releases a handle obtained from opendir;
    # doing it before recursing means only one directory is open at a time
    closedir($DIR);
    find_files(@subdirs) if ($#subdirs > -1);
  }
}

### DIGEST FUNCTION ###
# grab partial file hash, skipping already-read bytes
#   $file          : path of the file (must have an entry in %filesize)
#   $bytes_read    : number of leading bytes already hashed by earlier calls
#   $bytes_to_read : maximum number of further bytes to hash in this call
#   returns        : digest of everything hashed so far, with the file size
#                    appended
# Side effects: stores the running digest state in %filehash so that the next
# call can continue from it, and updates %digest_seen for the returned string
# (incremented when the whole file has been hashed, decremented when data
# remains).  Dies on open/seek/read failure or an unsupported -a value.
sub grab_digest {
  my ($file, $bytes_read, $bytes_to_read) = @_; my $tmp;
  my %class_for = (
    1 => 'Crypt::Digest::MD5',
    2 => 'Crypt::Digest::SHA1',
    3 => 'Crypt::Digest::SHA224',
    4 => 'Crypt::Digest::SHA256',
    5 => 'Crypt::Digest::SHA384',
    6 => 'Crypt::Digest::SHA512',
    7 => 'Crypt::Digest::BLAKE2b_256',
    8 => 'Crypt::Digest::BLAKE2b_512',
  );
  open(my $FILE, "<:raw", $file) or die "repeats error: unable to open $file\n";
  if ($bytes_read > 0) {
    # continue where the previous call stopped
    seek($FILE, $bytes_read, 0) or die "repeats error: unable to seek in $file\n";
    $tmp = $filehash{$file}->clone();  # clone required for BLAKE2b
  }
  elsif (exists($class_for{$opt_a + 0})) {
    # "+ 0" keeps the numeric comparison semantics ("08" still selects 8)
    $tmp = $class_for{$opt_a + 0}->new;
  }
  else { die "repeats error: unsupported algorithm selected\n"; }
  # The body always runs at least once; it stops when the request is used up,
  # when the recorded size has been reached, or at end of file.
  while (1) {
    my $chunk = ($bytes_to_read > $BUFSIZE) ? $BUFSIZE : $bytes_to_read;
    my $rc = read($FILE, my $data, $chunk);
    die "repeats error: unable to read data from $file\n" unless (defined($rc));
    if ($rc == 0) {
      # End of file before the size recorded by lstat was reached means the
      # file shrank after it was scanned.  Without this check the loop would
      # spin forever.  Treat what was read as the complete file.
      if (($chunk > 0) and ($bytes_read < $filesize{$file})) {
        warn "repeats warning: $file shrank while being examined\n";
        $filesize{$file} = $bytes_read;
      }
      last;
    }
    $tmp->add($data);
    $bytes_to_read -= $rc;
    $bytes_read += $rc;
    last unless (($bytes_read < $filesize{$file}) and ($bytes_to_read > 0));
  }
  close($FILE);
  $filehash{$file} = $tmp->clone();  # clone required for BLAKE2b
  # the size is part of the key, so only files of equal size can ever match
  my $digest = $tmp->digest . $filesize{$file};
  # negative tallies mark partially-read files, positive ones completed files
  ($bytes_read < $filesize{$file}) ? $digest_seen{$digest}-- : $digest_seen{$digest}++;
  return $digest;
}

### BEGIN MAIN PROGRAM ###
# Scan the directories named on the command line (duplicates and anything that
# is not a directory are dropped); with no arguments, scan '.'.
my @search_paths = ('.');
if (scalar(@ARGV) > 0) {
  @search_paths = grep { -d $_ } uniq(@ARGV);
}
find_files(@search_paths);
# every regular file found is a candidate to begin with
my @candidates = keys(%filesize);
if ($opt_v) {
  printf STDERR "repeats stage 0: total number of files = %d\n", scalar(@candidates);
}

### Optional STAGE 1 ###
if ($#candidates > 0) {
  # all zero-length files are the same
  # Sorted before reporting: @candidates is still in hash order here, and the
  # pairs built by push_to_results would otherwise differ from run to run.
  # Stage 4 sorts each of its groups in the same way.
  my @zeros = sort(grep { $filesize{$_} == 0 } @candidates);
  printf STDERR "repeats stage 1: num files with zero length = %d\n", ($#zeros + 1) if ($opt_v);
  # NOTE(review): hardlink filtering (stage 2) happens after this point, so
  # zero-length hardlinks are reported even without -l -- confirm intended.
  push_to_results(@zeros) if (($opt_z) and ($#zeros > 0));
  # remove zero-length files from the list and re-sort
  @candidates = sort(grep { $filesize{$_} > 0 } @candidates);
}

### Optional STAGE 2 ###
# Unless -l was given, keep only the first name of every (device, inode) pair
# so that hardlinks to one file are not compared with each other.
if (($#candidates > 0) and (not $opt_l)) {
  my @first_names = uniq_inode(@candidates);
  @candidates = @first_names;
  if ($opt_v) {
    printf STDERR "repeats stage 2: num files excluding hardlinks = %d\n", scalar(@candidates);
  }
}

### STAGE 3 ###
# remove files with a unique filesize
if ($#candidates > 0) {
  # Count sizes among the files that are still candidates.  %size_seen was
  # filled during the directory scan, so it also counts hardlinks dropped in
  # stage 2 and files reached through more than one search path; relying on it
  # would keep (and later hash) files that have no same-size peer left.
  my %size_count = ();
  $size_count{$filesize{$_}}++ foreach (@candidates);
  @candidates = grep { $size_count{$filesize{$_}} > 1 } @candidates;
  printf STDERR "repeats stage 3: num files with non-unique filesize = %d\n", ($#candidates + 1) if ($opt_v);
}

### STAGE 4 ###
# Hash the remaining candidates in growing slices.  After each slice, files
# that have been read completely and share a digest are reported; files that
# are only partly read stay in the list while at least one other partly-read
# file has the same digest so far; everything else is dropped.
my $bytes_read = 0;
my $bytes_to_read = int($opt_m);
# visit files in device/inode order
@candidates = sort {
  $filedev{$a} <=> $filedev{$b} or $filenode{$a} <=> $filenode{$b}
} @candidates;
while (scalar(@candidates) > 1) {
  # hash the next slice of every candidate; grab_digest refills %digest_seen
  %digest_seen = ();
  my %digest = ();
  foreach my $file (@candidates) {
    $digest{$file} = grab_digest($file, $bytes_read, $bytes_to_read);
  }
  $bytes_read += $bytes_to_read;
  # collect completely-read files whose digest occurred more than once
  my %files_for = ();
  foreach my $file (@candidates) {
    next unless ($digest_seen{$digest{$file}} > 1);
    push(@{$files_for{$digest{$file}}}, $file);
  }
  foreach my $group (values(%files_for)) {
    push_to_results(sort(@{$group})) if (scalar(@{$group}) > 1);
  }
  # carry on only with partly-read files that still have a matching peer
  my @unfinished = grep { $digest_seen{$digest{$_}} < -1 } @candidates;
  @candidates = @unfinished;
  if ($opt_v) {
    printf STDERR "repeats stage 4: num files with matching digest after %d bytes = %d (%d remaining)\n",
      $bytes_read, $match_count, scalar(@candidates);
  }
  # enlarge the slice for the next round
  $bytes_to_read = int($bytes_to_read * $opt_r);
}

### FINAL RESULTS ###
# print the collected result lines in sorted order, one per line
my @report = sort(@results);
foreach my $line (@report) {
  print($line, "\n");
}
