#!/usr/bin/perl
# k-nearest neighbor on HOL Light data, the supposed format are two files
# each line in each starting with the theorem name followed by colon
# the symbols files then contains space-separated symbols for each theorem
# and the reference file space-separated references for each theorem
#
# The constrfile can contain more theorems than the reffile, but every
# theorem in reffile has to have its symbols described in constrfile.
#
# SYNOPSIS:
# mptp@mizar:/local/data/mptp/ec/HolyHammer/Advisor/holdata1/tests/hd5$ for i in t_notrivsyms; do time perl -F ./knn.pl syms$i combdeps.m10u_all_atponly 40 1 3 $i"_m10u_kn40_3_all_atponly" >holdata1.snow.all$i"_m10u_kn40__3_all_atponly"; done &
#
#
# NOTE: Do we use the fact that a premise can prove itself?


=head1 NAME

knn.pl (k-nearest neighbor on HOL Light data)

=head1 SYNOPSIS

knn.pl [options] constrfile reffile knn [distweight normrefs ext]


 Options:
   --dumpfile=<arg>,      -d<arg>
   --loadfile=<arg>     -l<arg>
   --server=<arg>,        -s<arg>
   --help,                  -h
   --man

=head1 OPTIONS

=over 8

TODO

=cut

use strict;
use Pod::Usage;
use Getopt::Long;
use IO::Socket;

my $gdumpfile; # file where the final state is optionally dumped
my $gloadfile; # dumped file to load
my $gtestfile; # means non-incremental mode
my $goverlapfile; # do not compute overlaps, take them from this file (implies gtestfile, same line number)
my $glimit; # limit the number of printed hints
my $gweighting; # 1 (default) is log, 2 is linear inverse, 3 is quadratic inverse
my $gsnowout; # output as snow does (used by malarea)
my $gmany; # if > 0, the knn is ':' separated string of ints, the output prefixes KNN$N for each
my $gserverport; # run as server on this port
my ($help, $man);

Getopt::Long::Configure ("bundling");

GetOptions('dumpfile|d=s'    => \$gdumpfile,
	   'loadfile|l=s'    => \$gloadfile,
	   'testfile|t=s'    => \$gtestfile,
	   'overlapfile|o=s'    => \$goverlapfile,
	   'limit|L=i'    => \$glimit,
	   'weighting|w=i'    => \$gweighting,
	   'snowout|S=i'   => \$gsnowout,
	   'many|M=i'      => \$gmany,
	   'server|s=i'    => \$gserverport,
	   'help|h'          => \$help,
	   'man'             => \$man)
    or pod2usage(2);

pod2usage(1) if($help);
pod2usage(-exitstatus => 0, -verbose => 2) if($man);

pod2usage(2) if ($#ARGV < 2);

my $constrfile = shift;
my $reffile = shift;
my $knn = shift; # number of nearest numbers
my $distweight = shift; # 1 by default: use the overlap for weighting the results; if 0, the $knn neighbors  are aggregated equally
my $normrefs = shift; # 0 by default; if 1, the refs of each neighbor are weighted to add to one; if 2, the neighbor has half its weight; if 3, the neighbor has 1 and the rest add to one
my $ext = shift;  # optional extension for the refnr and symnr files


# die "at least three arguments expected" unless defined($constrfile) && defined($reffile) && defined($knn);

$distweight = 1 unless(defined($distweight));
$ext = defined($ext) ? ".$ext" : '';

$gdumpfile = '' unless(defined($gdumpfile));
$gloadfile = '' unless(defined($gloadfile));
$gtestfile = '' unless(defined($gtestfile));
$goverlapfile = '' unless(defined($goverlapfile));
$glimit = 0 unless(defined($glimit));
$gweighting = 1 unless(defined($gweighting));
$gsnowout = 0 unless(defined($gsnowout));
$gmany = 0 unless(defined($gmany));
$gserverport = 0 unless(defined($gserverport));

# change for verbose logging
sub LOGGING { 0 };

my @gmanyknn = ();

if($gmany > 0)
{
    @gmanyknn = split(':', $knn);
}

#die 'syms and refs different' if(`wc -l constrs` != `wc -l refs`);

# hash for features of formulas, we no longer make assumptions
# about the order in FEATURES

my %featmp = (); 
my @feat_arr = (); # names of flas in the feature file as they come

my @cn_arr = (); # symbol names as they come
my %cn_nums = (); 
my @cn_freq = (); # frequencues
my $cn_count = 0; # number of all features in all flas (with repetition)
my @cn_wght = (); # weights = frequencies/cn_count

if($gloadfile ne '')
{
    use Storable;
    my $ftmp = retrieve($gloadfile . '.ftmp');
    die "Unable to load from $gloadfile.ftmp" unless defined $ftmp;
    %featmp = %{$ftmp};

    my $cwght = retrieve($gloadfile . '.wght');
    die "Unable to load from $gloadfile.cwght" unless defined $cwght;
    @cn_wght = @{$cwght};

}
else 
{
# read the features, put them as hashes in featmp
    open(FEATURES, $constrfile) or die;
    while(<FEATURES>)
    {
	chop($_);
	$_ =~ m/^([^:]+):(.*)/ or die "bad line $_";
	my ($nm,$fla) = ($1,$2);
	my %features1 = ();
	my $numfeat = {};

	@features1{ (split(/, +/,$fla)) } = ();
	foreach my $cn (keys %features1)
	{
	    $cn_count++;
	    if( !(exists $cn_nums{$cn})) 
	    {
		push @cn_arr, $cn;
		push @cn_freq, 1;
		$cn_nums{$cn} = $#cn_arr;
	    }
	    else  # frequency
	    {
		$cn_freq[$cn_nums{$cn}]++;
	    }
	    $numfeat->{$cn_nums{$cn}} = ();
	}
	$featmp{$nm}=$numfeat;
	push( @feat_arr, $nm);
    }
    close(FEATURES);

# normalize weights

# this is not much useful
#    @cn_wght = map { $_ / $cn_count } @cn_freq;

# this is quite a bit better: the symbols with lower frequency are preferred 
#    @cn_wght = map { 1 / (1 + $_)} @cn_freq;

# generally worse (interesting) than above, but on some spots can get slightly better
    my $gdoccnt = scalar (keys %featmp);

    if($gweighting == 1)
    {
	@cn_wght = map { log( $gdoccnt / $_ ) } @cn_freq;
    }
    elsif($gweighting == 2)
    {
	@cn_wght = map { 1 / (1 + $_)} @cn_freq;
    }
    elsif($gweighting == 3)
    {
	@cn_wght = map { 1 / ((1 + $_)*(1 + $_))} @cn_freq;
    }
    else
    {
	@cn_wght = map { log( $gdoccnt / $_ ) } @cn_freq;
    }
}


if($gdumpfile ne '')
{
    use Storable;
    store(\%featmp, $gdumpfile . '.ftmp');
#    die "Unable to load from $gloadfile.ftmp" unless defined $ftmp;
#    %featmp = %{$ftmp};

    store(\@cn_wght, $gdumpfile . '.wght');
#    die "Unable to load from $gloadfile.cwght" unless defined $cwght;
#    @cn_wght = @{$cwght};

}


# maximum squared distance between a vector with all 1 and null vector
# not needed for anything now
my $gmaxsqdistance = 0;
foreach my $i (@cn_wght) { $gmaxsqdistance += $i * $i; }


# this is not the standard euclidean distance
# the bigger it is, the better
# we do not penalize fhash2 for non-overlapping features
sub WeightedOverlap
{
    my ($farr1, $fhash2) = @_;
    my $d = 0;

    foreach my $fnr (@$farr1)
    {
	$d += $cn_wght[$fnr] * $cn_wght[$fnr] if(exists($fhash2->{$fnr}));
    }

    return sqrt($d); 
}

sub WeightedOverlapRelat
{
    my ($farr1, $fhash2) = @_;
    my $d = 0;

    # using $n1 has no point: it is the same for the given example
    # my $n1 = 1 / scalar @$farr1;
    my $n2 = 1 / scalar (keys %$fhash2);

    foreach my $fnr (@$farr1)
    {
	$d += $cn_wght[$fnr] * $cn_wght[$fnr]  if(exists($fhash2->{$fnr}));
    }

    return sqrt($d)*sqrt($n2); 
}


# this takes into account also features that are not present in $farr1, but exist in $fhash2
# this is really a distance metric, so the lower the better
# It will probably behave worse than overlap, because the features are
# now typically generated mercilessly (eg a deep unusual term will
# generate many unusual features).
sub WeightedDistance
{
    my ($farr1, $fhash2) = @_;
    my $d = 0;
    my %overlap = ();

    foreach my $fnr (@$farr1)
    {
	if(exists($fhash2->{$fnr}))
	{
	    $overlap{$fnr} = ();
	}
	else 
	{ 
	    $d +=  ($cn_wght[$fnr] * $cn_wght[$fnr]);
	}
    }

    foreach my $fnr (keys %$fhash2)
    {
	$d +=  ($cn_wght[$fnr] * $cn_wght[$fnr]) unless(exists($overlap{$fnr}));
    }
    
    return sqrt($d); 
}


open(REFS, $reffile) or die;
my @namearr = (); # theorem names as they come
my %namenums = (); 
my @namerefs = (); # for each refnr a hash of its references with weights

# compute aggregate coverage for evaluation here
my @gcover =();


if($gloadfile ne '')
{
    use Storable;
    my $nrefs = retrieve($gloadfile . '.nrefs');
    die "Unable to load from $gloadfile.nrefs" unless defined $nrefs;
    @namerefs = @{$nrefs};

    my $cwght = retrieve($gloadfile . '.wght');
    die "Unable to load from $gloadfile.cwght" unless defined $cwght;
    @cn_wght = @{$cwght};

}


# the knn code 
# find k nearest neighbors and their weighted overlap, weight their refs by their overlap weight
sub KNN
{
    my ($features) = @_;
    #my @features1 =  @$features;    #map { $cn_nums{$_} if(exists($cn_nums{$_})) } @res;

    my %overlap = ();
	
    foreach my $prevref (@namearr)
    {
	$overlap{$prevref} = WeightedOverlapRelat($features, $featmp{$prevref});
    }

    # sorted refs by their overlap, from greatest
    my @knns = sort { $overlap{$b} <=> $overlap{$a} } keys %overlap;


    if($gmany == 0)
    {

	# hash of the aggregated refs of the @knns, with aggregated weights
	my %knnrefs = ();

	foreach my $nb (@knns[0 .. $knn-1])
	{
	    my $refwght = ($distweight == 1)? $overlap{$nb} : 1;
	    my $nbrefs = $namerefs[$namenums{$nb}];
	    
	    foreach my $nbref (keys %$nbrefs)
	    {
		$knnrefs{$nbref} += $refwght * $nbrefs->{$nbref};
	    }
	}    

	my @sortedrfs = sort { $knnrefs{$b} <=> $knnrefs{$a} } keys %knnrefs;

	return (\@sortedrfs,\%knnrefs);
    }
    # the returned thing is an array of the arrays for each of the @gmanyknn
    else
    {
	my @res = ();
	foreach my $k (@gmanyknn)
	{
	    my %knnrefs = ();

	    foreach my $nb (@knns[0 .. $k-1])
	    {
		my $refwght = ($distweight == 1)? $overlap{$nb} : 1;
		my $nbrefs = $namerefs[$namenums{$nb}];
	    
		foreach my $nbref (keys %$nbrefs)
		{
		    $knnrefs{$nbref} += $refwght * $nbrefs->{$nbref};
		}
	    }    

	    my @sortedrfs = sort { $knnrefs{$b} <=> $knnrefs{$a} } keys %knnrefs;
	    push(@res, [@sortedrfs]);

	}
	return (\@res, {});
    }


#    here ends the knn code (duplicated from above)
}


# FEATURES can now contain more entries than REFS, because some
# conjunctions are removed from refs (replaced totally by their
# conjuncts). The order should be preserved in both files.
while(<REFS>)
{
    my ($cn, $name1, $fla, $ref, $def);

    chop($_);
    $_ =~ m/^([^:]+):(.*)/ or die "bad line $_";
    my ($name, $rfs) = ($1, $2);
    my $rfs = $2;

    if(exists($featmp{$name}))
    {
	$fla = $featmp{$name};
    }
    else { die "no features for $name"; }

    # find k nearest neighbos and their weighted overlap, weight their refs by their overlap weight
    # Don't do this in the server mode or when we have a testfile - we just need to load
    if(($gserverport == 0) && ($gtestfile eq ''))
    {
	my @features1 = keys %$fla;

	my ($sortedrfs, $knnrefs) = KNN(\@features1);
	if($gmany == 0)
	{
	    my $lim1 = ($glimit == 0)? scalar(@$sortedrfs) : $glimit;
	    print $name, ':', join(" ", @$sortedrfs[0..($lim1-1)]), "\n";
	}
	else
	{
	    foreach my $i (0..$#gmanyknn)
	    {
		my $res = $sortedrfs->[$i];
		my $k = $gmanyknn[$i];
		my $lim1 = ($glimit == 0)? scalar(@$res) : $glimit;
		print 'KNN',$k,':',$name, ':', join(" ", @$res[0..($lim1-1)]), "\n";
	    }
	}
    }

    # possibly weighted refs of the current one
    my $currefs;

    if( exists $namenums{$name})
    {
	$currefs = $namerefs[$namenums{$name}];
    }
    else
    {
	push @namearr, $name;
	$namenums{$name} = $#namearr;
	$currefs = {};
	$currefs->{$name} = 1;
	$namerefs[$#namearr] = $currefs;
    }

    my %refs1 = ();
    @refs1{ (split(/ /,$rfs)) } = ();
    
    my $refcnt = scalar(keys %refs1);

    if($refcnt > 0)
    {
	if($normrefs == 1) { $currefs->{$name} = 1/(1+$refcnt); }
	elsif($normrefs == 2) { $currefs->{$name} = 0.5 }
    }

    foreach $ref (keys %refs1)
    {
	my ($ref1,$w1) = ($ref,'');
	if($ref=~ m/(.*)([(].*[)])/) { ($ref1,$w1) = ($1,$2); $currefs->{$ref1} = $w1; }
	else { $currefs->{$ref1} = 1; }

	unless( exists $namenums{$ref1})
	{
	    push @namearr, $ref1;
	    $namenums{$ref1} = $#namearr;
	    $namerefs[$#namearr] = { $ref1 => 1 };
	}

	if($normrefs == 1) { $currefs->{$ref1} = $currefs->{$ref1}/(1+$refcnt) }
	elsif($normrefs == 2) { $currefs->{$ref1} = $currefs->{$ref1}/(2*$refcnt) }
	elsif($normrefs == 3) { $currefs->{$ref1} = $currefs->{$ref1}/$refcnt }

    }

    # update coverage - unfinished
    # foreach my $i (0 .. 50)
    # {
    # 	my $j= int(10 * $i);

    # 	foreach my $k (@sortedrfs)
    # 	{

    # 	$gcover[$i] += 
    # }

}

# don't do this in the snow mode
if($gsnowout == 0)
{
    open(OUT,">refnr$ext");
    map { print OUT "$_\n"; } @namearr;
    close(OUT);

    open(OUT,">symnr$ext");
    map { print OUT "$_\n"; } @cn_arr;
    close(OUT);
}

# eval on test data if given
if($gtestfile ne '')
{
    open(TEST, $gtestfile) or die;
    my $nr = 0;
    while(<TEST>)
    {
	$_ =~ m/^([^:]+):(.*)/ or die "bad line $_";

	my ($nm,$fla) = ($1,$2);
	my %features1 = ();

	@features1{ (split(/, +/,$fla)) } = ();

	my @feat1 = map { $cn_nums{$_} if(exists($cn_nums{$_})) } (keys %features1);

	my ($sortedrfs, $knnrefs) = KNN(\@feat1);
	my $lim1 = ($glimit == 0)? scalar(@$sortedrfs) : $glimit;

	if($gsnowout > 0)
	{
	    # $gmany not handled yet
	    die "Unhandled" if($gmany != 0);
	    $nr++;
	    print "Example $nr Label: $nm\n";
	    foreach my $ref (@$sortedrfs[0..($lim1-1)])
	    {
		print "$ref:\t", $knnrefs->{$ref}, "\n";
	    }
	    print "\n";
	    # print join(":\n", @$sortedrfs), ":\n\n"; 
	}
	else 
	{
	    if($gmany == 0)
	    {
		print $nm, ':', join(" ", @$sortedrfs[0..($lim1-1)]), "\n";
	    }
	    else
	    {
		foreach my $i (0..$#gmanyknn)
		{
		    my $res = $sortedrfs->[$i];
		    my $k = $gmanyknn[$i];
		    my $lim1 = ($glimit == 0)? scalar(@$res) : $glimit;
		    print 'KNN',$k,':',$nm, ':', join(" ", @$res[0..($lim1-1)]), "\n";
		}
	    }
	}
    }
    close(TEST);
}

# the server starts here
if($gserverport != 0)
{
    # $gmany not handled yet
    die "Unhandled" if($gmany != 0);

    my $server = IO::Socket::INET->new( Proto     => "tcp",
					LocalPort => $gserverport,
					Listen    => SOMAXCONN,
					Reuse     => 1);

    die "cannot setup server" unless $server;
    print "[Server $0 accepting clients]\n";

    while (my $client = $server->accept())
    {
	my ($sym,$msg,$msgout,$msgout1,$msg1,@msg2,@res,@res1,@res2);
	$client->autoflush(1);

	$msg = "";

	print "[accepted client]\n";
#    $msg  = ReceiveFrom($client);
#    while( $_=<$client> ) { $msg = $msg . $_; }
	$msg = <$client>;
	print $msg if(LOGGING);
	chop $msg;
	print "[received bytes]\n";
#    @res  = unpack("a", $msg);
	@res = split(/\,/, $msg);


#    here starts the knn code (duplicated from above)
	my @features1 = map { $cn_nums{$_} if(exists($cn_nums{$_})) } @res;

	my ($sortedrfs, $knnrefs) = KNN(\@features1);

	$msgout1 = join(",", @$sortedrfs);
#    send $client, pack("N", length $msgout), 0;
	print $client $msgout1;
	close $client;
	print "[closed client]\n";
    }
}