This repository has been archived by the owner on Aug 15, 2020. It is now read-only.
forked from mit-medg/CliCon
-
Notifications
You must be signed in to change notification settings - Fork 103
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #6 from elensergwork/LSTM
Initial Update
- Loading branch information
Showing
12 changed files
with
2,243 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
token_pretrained_embedding_filepath vectors2.txt | ||
load_all_pretrained_token_embeddings False | ||
load_only_pretrained_token_embeddings False | ||
tagging_format bio | ||
use_character_lstm True | ||
use_crf True | ||
Use_LSTM True | ||
use_features_before_final_lstm False | ||
character_embedding_dimension 25 | ||
character_lstm_hidden_state_dimension 25 | ||
token_embedding_dimension 100 | ||
freeze_token_embeddings False | ||
token_lstm_hidden_state_dimension 100 | ||
optimizer sgd | ||
gradient_clipping_value 5.0 | ||
remap_unknown_tokens_to_unk True | ||
learning_rate 0.005 | ||
check_for_lowercase True | ||
check_for_digits_replaced_with_zeros True | ||
model_folder ./models/NN_models/Test_November | ||
conll_like_result_folder ./RESULTS/TEST_SAVER/NOVEMBER_DEBUG/ | ||
model_name model_00001.ckpt |
Large diffs are not rendered by default.
Oops, something went wrong.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,315 @@ | ||
#!/usr/bin/perl -w | ||
# conlleval: evaluate result of processing CoNLL-2000 shared task | ||
# usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file | ||
# README: http://cnts.uia.ac.be/conll2000/chunking/output.html | ||
# options: l: generate LaTeX output for tables like in | ||
# http://cnts.uia.ac.be/conll2003/ner/example.tex | ||
# r: accept raw result tags (without B- and I- prefix; | ||
# assumes one word per chunk) | ||
# d: alternative delimiter tag (default is single space) | ||
# o: alternative outside tag (default is O) | ||
# note: the file should contain lines with items separated | ||
# by $delimiter characters (default space). The final | ||
# two items should contain the correct tag and the | ||
# guessed tag in that order. Sentences should be | ||
# separated from each other by empty lines or lines | ||
# with $boundary fields (default -X-). | ||
# url: http://lcg-www.uia.ac.be/conll2000/chunking/ | ||
# started: 1998-09-25 | ||
# version: 2004-01-26 | ||
# author: Erik Tjong Kim Sang <[email protected]> | ||
|
||
use strict; | ||
|
||
my $false = 0; | ||
my $true = 42; | ||
|
||
my $boundary = "-X-"; # sentence boundary | ||
my $correct; # current corpus chunk tag (I,O,B) | ||
my $correctChunk = 0; # number of correctly identified chunks | ||
my $correctTags = 0; # number of correct chunk tags | ||
my $correctType; # type of current corpus chunk tag (NP,VP,etc.) | ||
my $delimiter = " "; # field delimiter | ||
my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979) | ||
my $firstItem; # first feature (for sentence boundary checks) | ||
my $foundCorrect = 0; # number of chunks in corpus | ||
my $foundGuessed = 0; # number of identified chunks | ||
my $guessed; # current guessed chunk tag | ||
my $guessedType; # type of current guessed chunk tag | ||
my $i; # miscellaneous counter | ||
my $inCorrect = $false; # currently processed chunk is correct until now | ||
my $lastCorrect = "O"; # previous chunk tag in corpus | ||
my $latex = 0; # generate LaTeX formatted output | ||
my $lastCorrectType = ""; # type of previously identified chunk tag | ||
my $lastGuessed = "O"; # previously identified chunk tag | ||
my $lastGuessedType = ""; # type of previous chunk tag in corpus | ||
my $lastType; # temporary storage for detecting duplicates | ||
my $line; # line | ||
my $nbrOfFeatures = -1; # number of features per line | ||
my $precision = 0.0; # precision score | ||
my $oTag = "O"; # outside tag, default O | ||
my $raw = 0; # raw input: add B to every token | ||
my $recall = 0.0; # recall score | ||
my $tokenCounter = 0; # token counter (ignores sentence breaks) | ||
|
||
my %correctChunk = (); # number of correctly identified chunks per type | ||
my %foundCorrect = (); # number of chunks in corpus per type | ||
my %foundGuessed = (); # number of identified chunks per type | ||
|
||
my @features; # features on line | ||
my @sortedTypes; # sorted list of chunk type names | ||
|
||
# sanity check | ||
while (@ARGV and $ARGV[0] =~ /^-/) { | ||
if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); } | ||
elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); } | ||
elsif ($ARGV[0] eq "-d") { | ||
shift(@ARGV); | ||
if (not defined $ARGV[0]) { | ||
die "conlleval: -d requires delimiter character"; | ||
} | ||
$delimiter = shift(@ARGV); | ||
} elsif ($ARGV[0] eq "-o") { | ||
shift(@ARGV); | ||
if (not defined $ARGV[0]) { | ||
die "conlleval: -o requires delimiter character"; | ||
} | ||
$oTag = shift(@ARGV); | ||
} else { die "conlleval: unknown argument $ARGV[0]\n"; } | ||
} | ||
if (@ARGV) { die "conlleval: unexpected command line argument\n"; } | ||
# process input | ||
while (<STDIN>) { | ||
chomp($line = $_); | ||
@features = split(/$delimiter/,$line); | ||
if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } | ||
elsif ($nbrOfFeatures != $#features and @features != 0) { | ||
printf STDERR "unexpected number of features: %d (%d)\n", | ||
$#features+1,$nbrOfFeatures+1; | ||
exit(1); | ||
} | ||
if (@features == 0 or | ||
$features[0] eq $boundary) { @features = ($boundary,"O","O"); } | ||
if (@features < 2) { | ||
die "conlleval: unexpected number of features in line $line\n"; | ||
} | ||
if ($raw) { | ||
if ($features[$#features] eq $oTag) { $features[$#features] = "O"; } | ||
if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } | ||
if ($features[$#features] ne "O") { | ||
$features[$#features] = "B-$features[$#features]"; | ||
} | ||
if ($features[$#features-1] ne "O") { | ||
$features[$#features-1] = "B-$features[$#features-1]"; | ||
} | ||
} | ||
# 20040126 ET code which allows hyphens in the types | ||
if ($features[$#features] =~ /^([^-]*)-(.*)$/) { | ||
$guessed = $1; | ||
$guessedType = $2; | ||
} else { | ||
$guessed = $features[$#features]; | ||
$guessedType = ""; | ||
} | ||
pop(@features); | ||
if ($features[$#features] =~ /^([^-]*)-(.*)$/) { | ||
$correct = $1; | ||
$correctType = $2; | ||
} else { | ||
$correct = $features[$#features]; | ||
$correctType = ""; | ||
} | ||
pop(@features); | ||
# ($guessed,$guessedType) = split(/-/,pop(@features)); | ||
# ($correct,$correctType) = split(/-/,pop(@features)); | ||
$guessedType = $guessedType ? $guessedType : ""; | ||
$correctType = $correctType ? $correctType : ""; | ||
$firstItem = shift(@features); | ||
|
||
# 1999-06-26 sentence breaks should always be counted as out of chunk | ||
if ( $firstItem eq $boundary ) { $guessed = "O"; } | ||
|
||
if ($inCorrect) { | ||
if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and | ||
&endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and | ||
$lastGuessedType eq $lastCorrectType) { | ||
$inCorrect=$false; | ||
$correctChunk++; | ||
$correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? | ||
$correctChunk{$lastCorrectType}+1 : 1; | ||
} elsif ( | ||
&endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != | ||
&endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or | ||
$guessedType ne $correctType ) { | ||
$inCorrect=$false; | ||
} | ||
} | ||
|
||
if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and | ||
&startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and | ||
$guessedType eq $correctType) { $inCorrect = $true; } | ||
|
||
if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { | ||
$foundCorrect++; | ||
$foundCorrect{$correctType} = $foundCorrect{$correctType} ? | ||
$foundCorrect{$correctType}+1 : 1; | ||
} | ||
if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { | ||
$foundGuessed++; | ||
$foundGuessed{$guessedType} = $foundGuessed{$guessedType} ? | ||
$foundGuessed{$guessedType}+1 : 1; | ||
} | ||
if ( $firstItem ne $boundary ) { | ||
if ( $correct eq $guessed and $guessedType eq $correctType ) { | ||
$correctTags++; | ||
} | ||
$tokenCounter++; | ||
} | ||
|
||
$lastGuessed = $guessed; | ||
$lastCorrect = $correct; | ||
$lastGuessedType = $guessedType; | ||
$lastCorrectType = $correctType; | ||
} | ||
if ($inCorrect) { | ||
$correctChunk++; | ||
$correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? | ||
$correctChunk{$lastCorrectType}+1 : 1; | ||
} | ||
|
||
if (not $latex) { | ||
# compute overall precision, recall and FB1 (default values are 0.0) | ||
$precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); | ||
$recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); | ||
$FB1 = 2*$precision*$recall/($precision+$recall) | ||
if ($precision+$recall > 0); | ||
|
||
# print overall performance | ||
printf "processed $tokenCounter tokens with $foundCorrect phrases; "; | ||
printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; | ||
if ($tokenCounter>0) { | ||
printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; | ||
printf "precision: %6.2f%%; ",$precision; | ||
printf "recall: %6.2f%%; ",$recall; | ||
printf "FB1: %6.2f\n",$FB1; | ||
} | ||
} | ||
|
||
# sort chunk type names | ||
undef($lastType); | ||
@sortedTypes = (); | ||
foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { | ||
if (not($lastType) or $lastType ne $i) { | ||
push(@sortedTypes,($i)); | ||
} | ||
$lastType = $i; | ||
} | ||
# print performance per chunk type | ||
if (not $latex) { | ||
for $i (@sortedTypes) { | ||
$correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; | ||
if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } | ||
else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } | ||
if (not($foundCorrect{$i})) { $recall = 0.0; } | ||
else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } | ||
if ($precision+$recall == 0.0) { $FB1 = 0.0; } | ||
else { $FB1 = 2*$precision*$recall/($precision+$recall); } | ||
printf "%17s: ",$i; | ||
printf "precision: %6.2f%%; ",$precision; | ||
printf "recall: %6.2f%%; ",$recall; | ||
printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; | ||
} | ||
} else { | ||
print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; | ||
for $i (@sortedTypes) { | ||
$correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; | ||
if (not($foundGuessed{$i})) { $precision = 0.0; } | ||
else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } | ||
if (not($foundCorrect{$i})) { $recall = 0.0; } | ||
else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } | ||
if ($precision+$recall == 0.0) { $FB1 = 0.0; } | ||
else { $FB1 = 2*$precision*$recall/($precision+$recall); } | ||
printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", | ||
$i,$precision,$recall,$FB1; | ||
} | ||
print "\\hline\n"; | ||
$precision = 0.0; | ||
$recall = 0; | ||
$FB1 = 0.0; | ||
$precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); | ||
$recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); | ||
$FB1 = 2*$precision*$recall/($precision+$recall) | ||
if ($precision+$recall > 0); | ||
printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", | ||
$precision,$recall,$FB1; | ||
} | ||
|
||
exit 0; | ||
|
||
# endOfChunk: checks if a chunk ended between the previous and current word | ||
# arguments: previous and current chunk tags, previous and current types | ||
# note: this code is capable of handling other chunk representations | ||
# than the default CoNLL-2000 ones, see EACL'99 paper of Tjong | ||
# Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 | ||
|
||
sub endOfChunk { | ||
my $prevTag = shift(@_); | ||
my $tag = shift(@_); | ||
my $prevType = shift(@_); | ||
my $type = shift(@_); | ||
my $chunkEnd = $false; | ||
|
||
if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } | ||
if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } | ||
if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } | ||
if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } | ||
|
||
if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } | ||
if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } | ||
if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } | ||
if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } | ||
|
||
if ($prevTag ne "O" and $prevTag ne "." and $prevType ne $type) { | ||
$chunkEnd = $true; | ||
} | ||
|
||
# corrected 1998-12-22: these chunks are assumed to have length 1 | ||
if ( $prevTag eq "]" ) { $chunkEnd = $true; } | ||
if ( $prevTag eq "[" ) { $chunkEnd = $true; } | ||
|
||
return($chunkEnd); | ||
} | ||
|
||
# startOfChunk: checks if a chunk started between the previous and current word | ||
# arguments: previous and current chunk tags, previous and current types | ||
# note: this code is capable of handling other chunk representations | ||
# than the default CoNLL-2000 ones, see EACL'99 paper of Tjong | ||
# Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 | ||
|
||
sub startOfChunk { | ||
my $prevTag = shift(@_); | ||
my $tag = shift(@_); | ||
my $prevType = shift(@_); | ||
my $type = shift(@_); | ||
my $chunkStart = $false; | ||
|
||
if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } | ||
if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } | ||
if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } | ||
if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } | ||
|
||
if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } | ||
if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } | ||
if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } | ||
if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } | ||
|
||
if ($tag ne "O" and $tag ne "." and $prevType ne $type) { | ||
$chunkStart = $true; | ||
} | ||
|
||
# corrected 1998-12-22: these chunks are assumed to have length 1 | ||
if ( $tag eq "[" ) { $chunkStart = $true; } | ||
if ( $tag eq "]" ) { $chunkStart = $true; } | ||
|
||
return($chunkStart); | ||
} |
Oops, something went wrong.