authors_compare_SIMPLE.pl
#!perl
BEGIN { # log fatal errors and warnings to a file so CGI failures are visible
	use CGI::Carp qw(carpout);
	open(LOG, ">>_error2.txt") or die("Unable to open mycgi-log: $!\n");
	carpout(LOG);
}
BEGIN { # echo die/warn messages back to the browser as HTML-escaped text
	$SIG{"__DIE__"} = $SIG{"__WARN__"} = sub {
		my $error = shift;
		chomp $error;
		$error =~ s/[<&>]/"&#".ord($&).";"/ge;
		print "Content-type: text/html\n\n$error\n";
		exit 0;
	};
}
# Copyright 2016-2020 Zawy, MIT license
# This program ranks how similar a "baseline" text file is to each file in a directory.
# Intended for text whose words use only "a to z" characters.
# It's based on finding the least "entropy difference", which is probably very close to n-gram methods.
# It's my implementation of Kullback–Leibler divergence, aka relative entropy.
# The algorithm is:
# Take the same number of words from each "known" author file, provided each file has at
# least as many words as the "baseline" (unknown-author) file. If a word is in the baseline
# but not in a "known" file, give that word a count of 0.25, as if it occurred 1/4 of one time.
# Let Ai = frequency of word i in the baseline file, and Bi = frequency of word i in the comparison file.
# relative entropy = sum over all i of Ai*log(Ai/Bi)
# The lowest relative entropy is the closest match.
#
# Works on 7 KB baseline files (about 1,000 words) if the known files are 50 KB. Both at 50 KB is darn good.
# Approximate accuracy: the true author ranks 1st about 50% of the time, given 20 authors in the same genre with 50 KB files.
# For professional work, use the ranking capability of the open SVMLink software.
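#
# Worked example (illustration only, not from the original): if "the" is 5% of the
# baseline (Ai=0.05) and 4% of a known file (Bi=0.04), it contributes
# 0.05*log(0.05/0.04) ~= 0.011 to the sum; identical distributions sum to 0, a perfect match.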
$windows ='no';
if ($windows eq 'yes' ) { $blah='\\'; } # windows
else { $blah = '/'; } # linux
######## TELL PROGRAM WHAT IT NEEDS TO KNOW #########
open(F,"<author_ignore_words.txt") or $words_to_ignore=''; $words_to_ignore=join('',<F>); close F; # comment these 2 lines out if the word list is small and use the scalar below
$words_to_ignore=~s/\n/\|/g;
# $words_to_ignore= 'bitcoin|node|nodes|general|generals|byzantine|encrypted|hash| cryptographic|coin|cryptography|coins|encrypt|messages|port|cryptography|network|cash|currency|transaction|ecash|security|distributed|asics|cpu|fee|fees|rsa|ip|ec-dsa|prebitcoin|decentralized|decentralize|transactions|hashcash|anonymous|cypherpunks|satoshi|money|http|smtp|tcp|arpanet|proof-of-work|nakamoto|block|blocks|chain|blockchain|proof|work'; # for example bitcoin|blockchain|byzantine|
# $baselinefile='author_baseline.txt';
$baselinefile = 'unknown_author.txt'; # author's text for comparison to authors in $dir. Stays in same directory as this program
$baselinesize = -s $baselinefile; # get size of file in bytes
$buffer = 1.07; # $buffer=1.07 helps assure enough words are pulled in from known files, but it means the "known" files must be > 7% bigger
$min_file_size = $buffer*$baselinesize;
# $dir='authors_all'; # all files must be > 30% bigger than the baseline file to make sure enough words are retrieved.
$dir = 'known_authors';
$smallest = 1000000000; # sentinel, larger than any real file
opendir(DIR, $dir) or die $!;
while ($file = readdir(DIR)) {
	$size = -s ".$blah$dir$blah$file";
	if ($file =~ m/\.txt$/ and $size > $min_file_size) {
		push(@files, $file);
		if ($size < $smallest) { $smallest = $size; }
	}
}
closedir(DIR);
$oversize = $smallest/$baselinesize/$buffer;
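# $oversize is how many times larger the smallest usable known file is than the baseline
# (after the buffer); known-file word counts are divided by it below so both texts are
# compared at the same effective length.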
######## PRINT HTML HEADER #######
print"Content-type: text/html\n\n<html><H3>Author Comparison, and entropy of authors</H3>
$baselinefile is $baselinesize bytes. <BR>
known directory: $dir<BR>
Words to ignore: $words_to_ignore<BR>";
if ($smallest == 1000000000) { print "\n\n No file found in the $dir folder."; exit; }
####### RUN PROGRAM ######
open(F,"<$baselinefile") or die $!; read(F,$c,$baselinesize); close F;
%baseline_count=get_word_counts($c); # stores count (value) of each word (key).
print "Using first " . int($smallest) . " bytes of known files\nOversize value is $oversize\nBuffer is $buffer\n\n$ignore\n\n";
foreach $file (@files) {
open(F, "<.$blah$dir$blah$file") or die $!;
# open(GG, ">temp_out_$file") or die $!;
read(F,$c,$baselinesize*$oversize*$buffer); close F;
%known_count=get_word_counts($c);
foreach $word (keys %baseline_count) {
$m=$baseline_count{$word};
# begin core calculation
if ($known_count{$word} >= 1) { $k = $known_count{$word}/$oversize; }
else { $k = 0.25/$oversize; } # 0.25 was determined by experiment; smooths words absent from the known file
$scores{$file}+=$m*log($m/$k); # Kullback–Leibler divergence aka relative entropy
# end core calculation
} # next word
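# Normalize by the baseline word count: a constant divisor, so it does not change the ranking but makes scores per-word.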
$scores{$file} /= $total_words;
# close GG;
} # next file
###### FINISHED ----- PRINT RESULTS ##########
print "First $total_words words from baseline text above and known texts below were compared.<BR><BR>";
@ranked = sort {$scores{$a} <=> $scores{$b} } keys %scores;
foreach $file (@ranked) { $rank++;
print "$rank " . int($scores{$file}) . " = $file <br>";
}
exit;
######## SUBROUTINE #########
sub get_word_counts { $c=$_[0];
$c=lc $c; # ignore capitalization
$c=~s/'|`|´|’//g;
# $c=~s/[^a-z ,.:;'"?!\(|\)]/ /gs; $c=~s/(\.|,|\!|\?|;|:|'|\(|\))/ $1 /gs; # keep punctuation
$c=~s/[^a-z ]/ /gs; # no punctuation, no numbers
$c=~s/ +/ /gs;
if ($words_to_ignore ne '') { $c=~s/ (?:$words_to_ignore) / /gsi; } # group the alternation so each " word " is matched whole
@c=split(" ", $c);
if ($firsttime eq '') { $total_words=$#c; $firsttime='nope'; } # first call (the baseline) fixes the word count
else { $#c=$total_words*$oversize; } # truncate known text to the same effective word count
undef %count;
foreach $c (@c) { $count{$c}++; $num_words{$file}++;} # get word counts
# foreach $c (@c) { $y=$z; $z=$c; $count{"$y $z"}++;$num_words{$file}++; } # as word pairs
# foreach $c (@c) { $x=$y; $y=$z; $z=$c; $count{"$x O $z"}++; $num_words{$file}++;} # missing middle
# foreach $c (@c) { $x=$y; $y=$z; $z=$c; $count{"$x $y $z"}++;$num_words{$file}++; } # triples
return %count; }
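# Usage sketch (an assumption from the paths above, not part of the original):
# put the unknown text in unknown_author.txt next to this script, put candidate
# .txt files (each at least 7% larger than the baseline) in ./known_authors/, then run
#   perl authors_compare_SIMPLE.pl > results.html
# and open results.html in a browser (the output is HTML with a CGI header).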