#!/usr/bin/perl # k-nearest neighbor on HOL Light data, the supposed format are two files # each line in each starting with the theorem name followed by colon # the symbols files then contains space-separated symbols for each theorem # and the reference file space-separated references for each theorem # # The constrfile can contain more theorems than the reffile, but every # theorem in reffile has to have its symbols described in constrfile. # # SYNOPSIS: # mptp@mizar:/local/data/mptp/ec/HolyHammer/Advisor/holdata1/tests/hd5$ for i in t_notrivsyms; do time perl -F ./knn.pl syms$i combdeps.m10u_all_atponly 40 1 3 $i"_m10u_kn40_3_all_atponly" >holdata1.snow.all$i"_m10u_kn40__3_all_atponly"; done & # # # NOTE: Do we use the fact that a premise can prove itself? =head1 NAME knn.pl (k-nearest neighbor on HOL Light data) =head1 SYNOPSIS knn.pl [options] constrfile reffile knn [distweight normrefs ext] Options: --dumpfile=, -d --loadfile= -l --server=, -s --help, -h --man =head1 OPTIONS =over 8 TODO =cut use strict; use Pod::Usage; use Getopt::Long; use IO::Socket; my $gdumpfile; # file where the final state is optionally dumped my $gloadfile; # dumped file to load my $gtestfile; # means non-incremental mode my $goverlapfile; # do not compute overlaps, take them from this file (implies gtestfile, same line number) my $glimit; # limit the number of printed hints my $gweighting; # 1 (default) is log, 2 is linear inverse, 3 is quadratic inverse my $gsnowout; # output as snow does (used by malarea) my $gmany; # if > 0, the knn is ':' separated string of ints, the output prefixes KNN$N for each my $gserverport; # run as server on this port my ($help, $man); Getopt::Long::Configure ("bundling"); GetOptions('dumpfile|d=s' => \$gdumpfile, 'loadfile|l=s' => \$gloadfile, 'testfile|t=s' => \$gtestfile, 'overlapfile|o=s' => \$goverlapfile, 'limit|L=i' => \$glimit, 'weighting|w=i' => \$gweighting, 'snowout|S=i' => \$gsnowout, 'many|M=i' => \$gmany, 'server|s=i' => \$gserverport, 'help|h' => \$help, 'man' => \$man) or pod2usage(2); pod2usage(1) if($help); pod2usage(-exitstatus => 0, -verbose => 2) if($man); pod2usage(2) if ($#ARGV < 2); my $constrfile = shift; my $reffile = shift; my $knn = shift; # number of nearest numbers my $distweight = shift; # 1 by default: use the overlap for weighting the results; if 0, the $knn neighbors are aggregated equally my $normrefs = shift; # 0 by default; if 1, the refs of each neighbor are weighted to add to one; if 2, the neighbor has half its weight; if 3, the neighbor has 1 and the rest add to one my $ext = shift; # optional extension for the refnr and symnr files # die "at least three arguments expected" unless defined($constrfile) && defined($reffile) && defined($knn); $distweight = 1 unless(defined($distweight)); $ext = defined($ext) ? ".$ext" : ''; $gdumpfile = '' unless(defined($gdumpfile)); $gloadfile = '' unless(defined($gloadfile)); $gtestfile = '' unless(defined($gtestfile)); $goverlapfile = '' unless(defined($goverlapfile)); $glimit = 0 unless(defined($glimit)); $gweighting = 1 unless(defined($gweighting)); $gsnowout = 0 unless(defined($gsnowout)); $gmany = 0 unless(defined($gmany)); $gserverport = 0 unless(defined($gserverport)); # change for verbose logging sub LOGGING { 0 }; my @gmanyknn = (); if($gmany > 0) { @gmanyknn = split(':', $knn); } #die 'syms and refs different' if(`wc -l constrs` != `wc -l refs`); # hash for features of formulas, we no longer make assumptions # about the order in FEATURES my %featmp = (); my @feat_arr = (); # names of flas in the feature file as they come my @cn_arr = (); # symbol names as they come my %cn_nums = (); my @cn_freq = (); # frequencues my $cn_count = 0; # number of all features in all flas (with repetition) my @cn_wght = (); # weights = frequencies/cn_count if($gloadfile ne '') { use Storable; my $ftmp = retrieve($gloadfile . '.ftmp'); die "Unable to load from $gloadfile.ftmp" unless defined $ftmp; %featmp = %{$ftmp}; my $cwght = retrieve($gloadfile . '.wght'); die "Unable to load from $gloadfile.cwght" unless defined $cwght; @cn_wght = @{$cwght}; } else { # read the features, put them as hashes in featmp open(FEATURES, $constrfile) or die; while() { chop($_); $_ =~ m/^([^:]+):(.*)/ or die "bad line $_"; my ($nm,$fla) = ($1,$2); my %features1 = (); my $numfeat = {}; @features1{ (split(/, +/,$fla)) } = (); foreach my $cn (keys %features1) { $cn_count++; if( !(exists $cn_nums{$cn})) { push @cn_arr, $cn; push @cn_freq, 1; $cn_nums{$cn} = $#cn_arr; } else # frequency { $cn_freq[$cn_nums{$cn}]++; } $numfeat->{$cn_nums{$cn}} = (); } $featmp{$nm}=$numfeat; push( @feat_arr, $nm); } close(FEATURES); # normalize weights # this is not much useful # @cn_wght = map { $_ / $cn_count } @cn_freq; # this is quite a bit better: the symbols with lower frequency are preferred # @cn_wght = map { 1 / (1 + $_)} @cn_freq; # generally worse (interesting) than above, but on some spots can get slightly better my $gdoccnt = scalar (keys %featmp); if($gweighting == 1) { @cn_wght = map { log( $gdoccnt / $_ ) } @cn_freq; } elsif($gweighting == 2) { @cn_wght = map { 1 / (1 + $_)} @cn_freq; } elsif($gweighting == 3) { @cn_wght = map { 1 / ((1 + $_)*(1 + $_))} @cn_freq; } else { @cn_wght = map { log( $gdoccnt / $_ ) } @cn_freq; } } if($gdumpfile ne '') { use Storable; store(\%featmp, $gdumpfile . '.ftmp'); # die "Unable to load from $gloadfile.ftmp" unless defined $ftmp; # %featmp = %{$ftmp}; store(\@cn_wght, $gdumpfile . '.wght'); # die "Unable to load from $gloadfile.cwght" unless defined $cwght; # @cn_wght = @{$cwght}; } # maximum squared distance between a vector with all 1 and null vector # not needed for anything now my $gmaxsqdistance = 0; foreach my $i (@cn_wght) { $gmaxsqdistance += $i * $i; } # this is not the standard euclidean distance # the bigger it is, the better # we do not penalize fhash2 for non-overlapping features sub WeightedOverlap { my ($farr1, $fhash2) = @_; my $d = 0; foreach my $fnr (@$farr1) { $d += $cn_wght[$fnr] * $cn_wght[$fnr] if(exists($fhash2->{$fnr})); } return sqrt($d); } sub WeightedOverlapRelat { my ($farr1, $fhash2) = @_; my $d = 0; # using $n1 has no point: it is the same for the given example # my $n1 = 1 / scalar @$farr1; my $n2 = 1 / scalar (keys %$fhash2); foreach my $fnr (@$farr1) { $d += $cn_wght[$fnr] * $cn_wght[$fnr] if(exists($fhash2->{$fnr})); } return sqrt($d)*sqrt($n2); } # this takes into account also features that are not present in $farr1, but exist in $fhash2 # this is really a distance metric, so the lower the better # It will probably behave worse than overlap, because the features are # now typically generated mercilessly (eg a deep unusual term will # generate many unusual features). sub WeightedDistance { my ($farr1, $fhash2) = @_; my $d = 0; my %overlap = (); foreach my $fnr (@$farr1) { if(exists($fhash2->{$fnr})) { $overlap{$fnr} = (); } else { $d += ($cn_wght[$fnr] * $cn_wght[$fnr]); } } foreach my $fnr (keys %$fhash2) { $d += ($cn_wght[$fnr] * $cn_wght[$fnr]) unless(exists($overlap{$fnr})); } return sqrt($d); } open(REFS, $reffile) or die; my @namearr = (); # theorem names as they come my %namenums = (); my @namerefs = (); # for each refnr a hash of its references with weights # compute aggregate coverage for evaluation here my @gcover =(); if($gloadfile ne '') { use Storable; my $nrefs = retrieve($gloadfile . '.nrefs'); die "Unable to load from $gloadfile.nrefs" unless defined $nrefs; @namerefs = @{$nrefs}; my $cwght = retrieve($gloadfile . '.wght'); die "Unable to load from $gloadfile.cwght" unless defined $cwght; @cn_wght = @{$cwght}; } # the knn code # find k nearest neighbors and their weighted overlap, weight their refs by their overlap weight sub KNN { my ($features) = @_; #my @features1 = @$features; #map { $cn_nums{$_} if(exists($cn_nums{$_})) } @res; my %overlap = (); foreach my $prevref (@namearr) { $overlap{$prevref} = WeightedOverlapRelat($features, $featmp{$prevref}); } # sorted refs by their overlap, from greatest my @knns = sort { $overlap{$b} <=> $overlap{$a} } keys %overlap; if($gmany == 0) { # hash of the aggregated refs of the @knns, with aggregated weights my %knnrefs = (); foreach my $nb (@knns[0 .. $knn-1]) { my $refwght = ($distweight == 1)? $overlap{$nb} : 1; my $nbrefs = $namerefs[$namenums{$nb}]; foreach my $nbref (keys %$nbrefs) { $knnrefs{$nbref} += $refwght * $nbrefs->{$nbref}; } } my @sortedrfs = sort { $knnrefs{$b} <=> $knnrefs{$a} } keys %knnrefs; return (\@sortedrfs,\%knnrefs); } # the returned thing is an array of the arrays for each of the @gmanyknn else { my @res = (); foreach my $k (@gmanyknn) { my %knnrefs = (); foreach my $nb (@knns[0 .. $k-1]) { my $refwght = ($distweight == 1)? $overlap{$nb} : 1; my $nbrefs = $namerefs[$namenums{$nb}]; foreach my $nbref (keys %$nbrefs) { $knnrefs{$nbref} += $refwght * $nbrefs->{$nbref}; } } my @sortedrfs = sort { $knnrefs{$b} <=> $knnrefs{$a} } keys %knnrefs; push(@res, [@sortedrfs]); } return (\@res, {}); } # here ends the knn code (duplicated from above) } # FEATURES can now contain more entries than REFS, because some # conjunctions are removed from refs (replaced totally by their # conjuncts). The order should be preserved in both files. while() { my ($cn, $name1, $fla, $ref, $def); chop($_); $_ =~ m/^([^:]+):(.*)/ or die "bad line $_"; my ($name, $rfs) = ($1, $2); my $rfs = $2; if(exists($featmp{$name})) { $fla = $featmp{$name}; } else { die "no features for $name"; } # find k nearest neighbos and their weighted overlap, weight their refs by their overlap weight # Don't do this in the server mode or when we have a testfile - we just need to load if(($gserverport == 0) && ($gtestfile eq '')) { my @features1 = keys %$fla; my ($sortedrfs, $knnrefs) = KNN(\@features1); if($gmany == 0) { my $lim1 = ($glimit == 0)? scalar(@$sortedrfs) : $glimit; print $name, ':', join(" ", @$sortedrfs[0..($lim1-1)]), "\n"; } else { foreach my $i (0..$#gmanyknn) { my $res = $sortedrfs->[$i]; my $k = $gmanyknn[$i]; my $lim1 = ($glimit == 0)? scalar(@$res) : $glimit; print 'KNN',$k,':',$name, ':', join(" ", @$res[0..($lim1-1)]), "\n"; } } } # possibly weighted refs of the current one my $currefs; if( exists $namenums{$name}) { $currefs = $namerefs[$namenums{$name}]; } else { push @namearr, $name; $namenums{$name} = $#namearr; $currefs = {}; $currefs->{$name} = 1; $namerefs[$#namearr] = $currefs; } my %refs1 = (); @refs1{ (split(/ /,$rfs)) } = (); my $refcnt = scalar(keys %refs1); if($refcnt > 0) { if($normrefs == 1) { $currefs->{$name} = 1/(1+$refcnt); } elsif($normrefs == 2) { $currefs->{$name} = 0.5 } } foreach $ref (keys %refs1) { my ($ref1,$w1) = ($ref,''); if($ref=~ m/(.*)([(].*[)])/) { ($ref1,$w1) = ($1,$2); $currefs->{$ref1} = $w1; } else { $currefs->{$ref1} = 1; } unless( exists $namenums{$ref1}) { push @namearr, $ref1; $namenums{$ref1} = $#namearr; $namerefs[$#namearr] = { $ref1 => 1 }; } if($normrefs == 1) { $currefs->{$ref1} = $currefs->{$ref1}/(1+$refcnt) } elsif($normrefs == 2) { $currefs->{$ref1} = $currefs->{$ref1}/(2*$refcnt) } elsif($normrefs == 3) { $currefs->{$ref1} = $currefs->{$ref1}/$refcnt } } # update coverage - unfinished # foreach my $i (0 .. 50) # { # my $j= int(10 * $i); # foreach my $k (@sortedrfs) # { # $gcover[$i] += # } } # don't do this in the snow mode if($gsnowout == 0) { open(OUT,">refnr$ext"); map { print OUT "$_\n"; } @namearr; close(OUT); open(OUT,">symnr$ext"); map { print OUT "$_\n"; } @cn_arr; close(OUT); } # eval on test data if given if($gtestfile ne '') { open(TEST, $gtestfile) or die; my $nr = 0; while() { $_ =~ m/^([^:]+):(.*)/ or die "bad line $_"; my ($nm,$fla) = ($1,$2); my %features1 = (); @features1{ (split(/, +/,$fla)) } = (); my @feat1 = map { $cn_nums{$_} if(exists($cn_nums{$_})) } (keys %features1); my ($sortedrfs, $knnrefs) = KNN(\@feat1); my $lim1 = ($glimit == 0)? scalar(@$sortedrfs) : $glimit; if($gsnowout > 0) { # $gmany not handled yet die "Unhandled" if($gmany != 0); $nr++; print "Example $nr Label: $nm\n"; foreach my $ref (@$sortedrfs[0..($lim1-1)]) { print "$ref:\t", $knnrefs->{$ref}, "\n"; } print "\n"; # print join(":\n", @$sortedrfs), ":\n\n"; } else { if($gmany == 0) { print $nm, ':', join(" ", @$sortedrfs[0..($lim1-1)]), "\n"; } else { foreach my $i (0..$#gmanyknn) { my $res = $sortedrfs->[$i]; my $k = $gmanyknn[$i]; my $lim1 = ($glimit == 0)? scalar(@$res) : $glimit; print 'KNN',$k,':',$nm, ':', join(" ", @$res[0..($lim1-1)]), "\n"; } } } } close(TEST); } # the server starts here if($gserverport != 0) { # $gmany not handled yet die "Unhandled" if($gmany != 0); my $server = IO::Socket::INET->new( Proto => "tcp", LocalPort => $gserverport, Listen => SOMAXCONN, Reuse => 1); die "cannot setup server" unless $server; print "[Server $0 accepting clients]\n"; while (my $client = $server->accept()) { my ($sym,$msg,$msgout,$msgout1,$msg1,@msg2,@res,@res1,@res2); $client->autoflush(1); $msg = ""; print "[accepted client]\n"; # $msg = ReceiveFrom($client); # while( $_=<$client> ) { $msg = $msg . $_; } $msg = <$client>; print $msg if(LOGGING); chop $msg; print "[received bytes]\n"; # @res = unpack("a", $msg); @res = split(/\,/, $msg); # here starts the knn code (duplicated from above) my @features1 = map { $cn_nums{$_} if(exists($cn_nums{$_})) } @res; my ($sortedrfs, $knnrefs) = KNN(\@features1); $msgout1 = join(",", @$sortedrfs); # send $client, pack("N", length $msgout), 0; print $client $msgout1; close $client; print "[closed client]\n"; } }