#!/usr/bin/perl
# $Id$

# Copyright (C) 1995-1997 by Zygo Blaxell <zblaxell@hungrycats.org>
# Use, modification, and distribution permitted
# under the terms of the GNU GPL.

$report_only=$ENV{'dupescan_report_only'};
$xdev=$report_only?"":"-xdev";

$*=1;
$/="\0";

($USER=getpwuid($<)) || die "Can't find your username ($<): $!";
$USER=$ENV{'dupescan_user'} || $USER;
$ENV{'dupescan_user'}='*' unless $>;
$userparam=$ENV{'dupescan_user'} eq '*' ? "" : ( $USER ? "-user $USER" : "-nouser");

if (open(FIND,"find $xdev $userparam -type f ! -empty -printf 'unknown %s %i %p\\0' |")) {
	while (<FIND>) {
		($sum,$size,$inode,$name)=m/^(\S+) (\d+) (\d+) (.+)\0$/;
		$inode_to_size{$inode}=$size;
		$inode_to_name{$inode}.="$name\0";
		$name_to_inode{$name}=$inode;
		$size_to_inode{$size}=join("\0",grep($_ != $inode,split(/\0/,$size_to_inode{$size})),$inode);
		$inode_to_sum{$inode}=$sum;
	}
}
close(FIND);

# Add previously generated checksums for old files

if (open(DB,"<.dupe1db")) {
	while (<DB>) {
		($sum,$size,$inode,$name)=m/^(\S+) (\d+) (\d+) (.+)\0$/;
		#print STDERR "sum=$sum, size=$size, inode=$inode, name=$name\n";
		next unless ($name_to_inode{$name}==$inode && 
				$inode_to_size{$inode}==$size &&
				$inode_to_sum{$inode} eq 'unknown');
		#print STDERR "(accepting)\n";
		$inode_to_sum{$inode}=$sum;
	}
}
close(DB);

if (open(DB,"<.dupe2db")) {
	while (<DB>) {
		($sum,$size,$inode,$otherinode)=m/^(\S+) (\d+) (\d+) (\d+)\0$/;
		#print STDERR "sum=$sum, size=$size, inode=$inode, otherinode=$otherinode\n";
		next unless ($inode_to_size{$inode}==$size &&
				($inode_to_sum{$inode} eq $sum));
		#print STDERR "(accepting)\n";
		$are_different{"$inode,$otherinode"}++;
	}
}
close(DB);

$saved_total=0;

inode:

foreach $size (sort { $b <=> $a } keys(%size_to_inode)) {

tryloop:

	until (0) {
		@inodelist=grep(defined($inode_to_size{$_}),split(/\0/,$size_to_inode{$size}));
		next inode unless $inodelist[1];
		print STDERR "Inodes of size $size:  @inodelist\n";
		foreach $inode (@inodelist) {
			if ($inode_to_sum{$inode} eq 'unknown') {
				$inode_to_sum{$inode}=&do_sum($inode_to_name{$inode});
			} else {
				print STDERR "Already know checksum of $inode is $inode_to_sum{$inode}\n";
			}
		}
		foreach $i (0..$#inodelist) {
			$ii=$inodelist[$i];
			(@fn1)=split(/\0/,$inode_to_name{$ii});
			foreach $j (($i+1)..$#inodelist) {
				$ij=$inodelist[$j];
				next if $inode_to_sum{$ii} ne $inode_to_sum{$ij};
				if ($are_different{"$ii,$ij"} || $are_different{"$ij,$ii"}) {
					print STDERR "Inodes $ii and $ij have identical checksum but are known to be different\n";
					next;
				}
				print STDERR "Inodes $ii and $ij have identical checksum\n";
				(@fn2)=split(/\0/,$inode_to_name{$ij});
				print STDERR "Comparing @fn1 with @fn2...";
				if (&do_cmp($fn1[0],$fn2[0])) {
					print STDERR "Different.\n";
					$are_different{"$ii,$ij"}++;
					next;
				} else {
					print STDERR "Identical.\n";
					$saved_total+=$size;
					if ($report_only) {
						print STDOUT "@fn1 and @fn2 are identical ($size bytes).\n";
					} else {
						if ( (@fn1)+0 > (@fn2)+0 ) {
							@temp=@fn1;
							$temp=$ii;
							@fn1=@fn2;
							$ii=$ij;
							@fn2=@temp;
							$ij=$temp;
						}
						foreach $target (@fn1) {
							foreach $source (@fn2) {
								unlink(".dupelink.$$");
								#print STDERR "link($source,.dupelink.$$)...";
								warn "link $source, .dupelink.$$: $!" unless link($source,".dupelink.$$");
								#print STDERR "rename(.dupelink.$$,$target)...";
								last if rename(".dupelink.$$",$target);
								warn "rename .dupelink.$$, $target: $!";
								unlink(".dupelink.$$") || die "Can't delete .dupelink.$$.  Exiting for safety reasons:  $!";
							}
							#print STDERR "\n";
						}
						system("chmod","-v","a-w,og+r",$fn1[0]);
						system("chown","-v","0.0",$fn1[0]) unless $>;
					}
					# mangle connectivity
					$inode_to_name{$ij}=join("\0",sort(@fn1,@fn2));
					delete($inode_to_name{$ii});
					delete($inode_to_sum{$ii});
					delete($inode_to_size{$ii});
					foreach $name (@fn1) {
						$name_to_inode{$name}=$ij;
					}
					$done=0;
					next tryloop;
				}
			}
		}
		last tryloop;
	}
}

open(DB,">.dupe1db") || die "open .dupe1db: $!";
foreach $inode (keys(%inode_to_sum)) {
	$size=$inode_to_size{$inode};
	$sum=$inode_to_sum{$inode};
	next if $sum eq 'unknown';
	foreach $name (split(/\0/,$inode_to_name{$inode})) {
		$name || next;
		print DB sprintf("%s %d %d %s\0",$sum,$size,$inode,$name);
	}
}
close(DB);

open(DB,">.dupe2db") || die "open .dupe2db: $!";
foreach $pair (keys(%are_different)) {
	($inode,$otherinode)=split(/,/,$pair);
	$size=$inode_to_size{$inode};
	$sum=$inode_to_sum{$inode};
	warn "This shouldn't happen!" if $sum eq 'unknown';
	print DB sprintf("%s %d %d %d\0",$sum,$size,$inode,$otherinode);
}
close(DB);

print STDERR "Database updated.  Done (saved $saved_total bytes)\n";

sub do_sum {
	($the_file=$_[0]) =~ s/\0.*$//;
	# Crude, but very effective
	unless (open(INSUM,"-|")) {
		open(STDIN,"<$the_file\0") || die "open $the_file: $!";
		exec("md5sum") || die "exec: $!";
	}
	chop($checksum=<INSUM>);
	$checksum =~ s/^([\da-fA-F]+).*$/$1/;
	unless (close(INSUM)) {
		warn "Checksum exited, status $?";
		$checknum++;
		$checksum="not-a-checksum-$checknum";
	}
	die "Checksum failed: got $checksum" unless $checksum =~ /[a-fA-F0-9]/o;
	print STDERR "Checksum of $the_file is $checksum\n";
	return $checksum;
}

sub do_cmp {
	($file1,$file2) = @_;
	$file1 =~ s/\0.*$//;
	$file2 =~ s/\0.*$//;

	# Crude, but effective.
	system("cmp","-s",$file1,$file2); 
	return $?;
}
