diff --git a/LSTM_parameters.txt b/LSTM_parameters.txt new file mode 100644 index 0000000..c9c4fcd --- /dev/null +++ b/LSTM_parameters.txt @@ -0,0 +1,22 @@ +token_pretrained_embedding_filepath vectors2.txt +load_all_pretrained_token_embeddings False +load_only_pretrained_token_embeddings False +tagging_format bio +use_character_lstm True +use_crf True +Use_LSTM True +use_features_before_final_lstm False +character_embedding_dimension 25 +character_lstm_hidden_state_dimension 25 +token_embedding_dimension 100 +freeze_token_embeddings False +token_lstm_hidden_state_dimension 100 +optimizer sgd +gradient_clipping_value 5.0 +remap_unknown_tokens_to_unk True +learning_rate 0.005 +check_for_lowercase True +check_for_digits_replaced_with_zeros True +model_folder ./models/NN_models/Test_November +conll_like_result_folder ./RESULTS/TEST_SAVER/NOVEMBER_DEBUG/ +model_name model_00001.ckpt diff --git a/code/DatasetCliner_experimental.py b/code/DatasetCliner_experimental.py new file mode 100644 index 0000000..8d949f2 --- /dev/null +++ b/code/DatasetCliner_experimental.py @@ -0,0 +1,441 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Oct 27 12:55:40 2017 + +@author: elena +""" + +import sklearn.preprocessing +import collections +import codecs +#import utils_nlp +import re +import time +#import token +import os +import pickle +import random +import numpy as np +import helper_dataset as hd + + + + +def lists_to_dataset_structure(sentences_tokens,sentence_tags,total_token_counter,token_count,label_count,character_count): + labels=[] + tokens=[] + new_label_sequence=[] + new_token_sequence=[] + + features="" + feature_file_name="" + feature_vector_size=0 + + for idx,sentence in enumerate(sentences_tokens): + for token_idx,token_i in enumerate(sentence): + new_token_sequence.append(token_i) + new_label_sequence.append(sentence_tags[idx][token_idx]) + + token_count[token_i] += 1 + label_count[sentence_tags[idx][token_idx]] += 1 + + if token_idx == 
len(sentence) - 1: + labels.append(new_label_sequence) + tokens.append(new_token_sequence) + new_token_sequence = [] + new_label_sequence = [] + # FEATURES ARE NOT SUPPORTED: Can be done if we are getting a third list that looks like [[f1,f2,f3],[f1,f2,f3]... for each token] + token_features=[] + features_as_array=np.array(token_features,dtype=np.dtype('int32')) + features_as_array=features_as_array.reshape((features_as_array.shape[0],1)) + features_as_array=np.transpose(features_as_array) + + features="" + feature_file_name="" + feature_vector_size=0 + + + total_token_counter+=1 + for character in token_i: + character_count[character] += 1 + + return labels, tokens, token_count, label_count, character_count,features,feature_file_name,feature_vector_size + + + + + + +class Dataset(object): + """A class for handling data sets.""" + + def __init__(self, name='', verbose=False, debug=False): + self.name = name + self.verbose = verbose + self.debug = debug + + + def _parse_dataset(self, dataset_filepath, dataset_type, sentences_list=[],tags_list=[], Not_here=False): + + token_count = collections.defaultdict(lambda: 0) #initialized by a function + label_count = collections.defaultdict(lambda: 0) + character_count = collections.defaultdict(lambda: 0) + longest_sentence=0 + + # Currently Not supported, features + #feature_file_name=os.getcwd()+os.sep+"test_cliner"+dataset_type+".hdf5" + # size_of_features=0 + + + # Currentlt Not supported - features + # f = h5py.File(feature_file_name, "w") + # dset = f.create_dataset("word-features", (0, size_of_features), maxshape=(None, size_of_features),dtype=np.dtype('int32'), chunks=True) #44 + #dt = h5py.special_dtype(vlen=np.dtype('int32')) + #sentence_words=f.create_dataset("sentences-words",(0,),dtype=dt,chunks=True,maxshape=(None,)) + + line_count =-1 + sent_count=-1 + total_token_counter=0 + token_counter_offset_sent=0 + + sentence_counter=0 + + tokens=[] + labels=[] + features=[] + + characters=[] # NOT USED (?) 
+ + #extract token features for agumentation + token_features=[] + + token_lengths=[] + new_token_sequence=[] + new_label_sequence = [] + #new_token_features_sequence=[] + + + + + #labels, tokens, token_count, label_count, character_count,features,feature_file_name,feature_vector_size + if Not_here==False: + labels, tokens, token_count, label_count, character_count,features,feature_file_name,feature_vector_size=lists_to_dataset_structure(sentences_list,tags_list,total_token_counter,token_count,label_count,character_count) + + + + return labels, tokens, token_count, label_count, character_count,features,feature_file_name,feature_vector_size + + + + + def _convert_to_indices(self, dataset_types): + # Frank and Jennies Function + + tokens = self.tokens + labels = self.labels + token_to_index = self.token_to_index + character_to_index = self.character_to_index + label_to_index = self.label_to_index + index_to_label = self.index_to_label + + # Map tokens and labels to their indices + token_indices = {} + label_indices = {} + characters = {} + token_lengths = {} + character_indices = {} + character_indices_padded = {} + for dataset_type in dataset_types: + print (dataset_type) + token_indices[dataset_type] = [] + characters[dataset_type] = [] + character_indices[dataset_type] = [] + token_lengths[dataset_type] = [] + character_indices_padded[dataset_type] = [] + + for token_sequence in tokens[dataset_type]: + token_indices[dataset_type].append([token_to_index.get(token, self.UNK_TOKEN_INDEX) for token in token_sequence]) + characters[dataset_type].append([list(token) for token in token_sequence]) + character_indices[dataset_type].append([[character_to_index.get(character, random.randint(1, max(self.index_to_character.keys()))) for character in token] for token in token_sequence]) + token_lengths[dataset_type].append([len(token) for token in token_sequence]) + longest_token_length_in_sequence = max(token_lengths[dataset_type][-1]) + 
character_indices_padded[dataset_type].append([hd.pad_list(temp_token_indices, longest_token_length_in_sequence, self.PADDING_CHARACTER_INDEX) for temp_token_indices in character_indices[dataset_type][-1]]) + + label_indices[dataset_type] = [] + for label_sequence in labels[dataset_type]: + label_indices[dataset_type].append([label_to_index[label] for label in label_sequence]) + + label_binarizer = sklearn.preprocessing.LabelBinarizer() + label_binarizer.fit(range(max(index_to_label.keys()) + 1)) + label_vector_indices = {} + for dataset_type in dataset_types: + label_vector_indices[dataset_type] = [] + for label_indices_sequence in label_indices[dataset_type]: + label_vector_indices[dataset_type].append(label_binarizer.transform(label_indices_sequence)) + + return token_indices, label_indices, character_indices_padded, character_indices, token_lengths, characters, label_vector_indices + + def update_dataset(self, dataset_filepaths, dataset_types, Datasets_tokens, Datasets_labels): + + ''' + dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy' + Overwrites the data of type specified in dataset_types using the existing token_to_index, character_to_index, and label_to_index mappings. 
+ ''' + + + # def _parse_dataset(self, dataset_filepath, dataset_type, sentences_list=[],tags_list=[], Not_here=False): + for dataset_type in dataset_types: + print (dataset_type) + self.labels[dataset_type], self.tokens[dataset_type], _, _, _,_,_,_= self._parse_dataset("",dataset_type, Datasets_tokens[dataset_type],Datasets_labels[dataset_type]) + + token_indices, label_indices, character_indices_padded, character_indices, token_lengths, characters, label_vector_indices = self._convert_to_indices(dataset_types) + + self.token_indices.update(token_indices) + self.label_indices.update(label_indices) + self.character_indices_padded.update(character_indices_padded) + self.character_indices.update(character_indices) + self.token_lengths.update(token_lengths) + self.characters.update(characters) + self.label_vector_indices.update(label_vector_indices) + + def load_dataset(self,avaliable_datasets_sent,avaliable_datasets_labels, dataset_filepaths, parameters, token_to_vector=None,pretrained_dataset=None): + ''' + dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy' + ''' + start_time = time.time() + print('Load dataset... 
', end='', flush=True) + if parameters['token_pretrained_embedding_filepath'] != '': + if token_to_vector==None: + token_to_vector = hd.load_pretrained_token_embeddings(parameters) + else: + token_to_vector = {} + + all_tokens_in_pretraining_dataset = [] + all_characters_in_pretraining_dataset = [] + + if parameters['use_pretrained_model']: + + + + #temp_pretrained_dataset_adress="./models/NN_models/1235-4/dataset.pickle" #"./models/NN_models/1234-5/dataset.pickle" + if pretrained_dataset==None: + temp_pretrained_dataset_adress=parameters['model_folder']+os.sep+"dataset.pickle" + pretraining_dataset = pickle.load(open(temp_pretrained_dataset_adress, "rb")) + print ("Pre-loading Pre-trained dataset objects") + else: + pretraining_dataset=pretrained_dataset + print ("Pretrained dataset was pre-loaded") + + all_tokens_in_pretraining_dataset = pretraining_dataset.index_to_token.values() + all_characters_in_pretraining_dataset = pretraining_dataset.index_to_character.values() + + + remap_to_unk_count_threshold = 1 + self.UNK_TOKEN_INDEX = 0 + self.PADDING_CHARACTER_INDEX = 0 + self.tokens_mapped_to_unk = [] + self.UNK = 'UNK' + self.unique_labels = [] + labels = {} + tokens = {} + label_count = {} + token_count = {} + character_count = {} + + + features={} + features_file_names={} + feature_vector_size={} + #deploy + + for dataset_type in ['train', 'valid', 'test','deploy']: + Not_here=False + + if dataset_type not in avaliable_datasets_sent: + Not_here=True + #_parse_dataset(self, dataset_filepath,dataset_type,sentences_list="",tags_list="") + if Not_here==False: + labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type], features[dataset_type], \ + features_file_names[dataset_type],feature_vector_size[dataset_type] \ + = self._parse_dataset("", dataset_type, sentences_list=avaliable_datasets_sent[dataset_type], tags_list=avaliable_datasets_labels[dataset_type]) + + + if Not_here==True: + 
labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type], features[dataset_type], \ + features_file_names[dataset_type],feature_vector_size[dataset_type] \ + = self._parse_dataset("", dataset_type, sentences_list=[], tags_list=[]) # + + + token_count['all'] = {} + for token in list(token_count['train'].keys()) + list(token_count['valid'].keys()) + list(token_count['test'].keys()) + list(token_count['deploy'].keys()): + token_count['all'][token] = token_count['train'][token] + token_count['valid'][token] + token_count['test'][token] + token_count['deploy'][token] + + + + if parameters['load_all_pretrained_token_embeddings']: + for token in token_to_vector: + if token not in token_count['all']: + token_count['all'][token] = -1 + token_count['train'][token] = -1 + for token in all_tokens_in_pretraining_dataset: + if token not in token_count['all']: + token_count['all'][token] = -1 + token_count['train'][token] = -1 + + character_count['all'] = {} + for character in list(character_count['train'].keys()) + list(character_count['valid'].keys()) + list(character_count['test'].keys()) + list(character_count['deploy'].keys()): + character_count['all'][character] = character_count['train'][character] + character_count['valid'][character] + character_count['test'][character] + character_count['deploy'][character] + + for character in all_characters_in_pretraining_dataset: + if character not in character_count['all']: + character_count['all'][character] = -1 + character_count['train'][character] = -1 + + + label_count['all'] = {} + for character in list(label_count['train'].keys()) + list(label_count['valid'].keys()) + list(label_count['test'].keys()) + list(label_count['deploy'].keys()): + label_count['all'][character] = label_count['train'][character] + label_count['valid'][character] + label_count['test'][character] + label_count['deploy'][character] + + token_count['all'] = 
hd.order_dictionary(token_count['all'], 'value_key', reverse = True) + label_count['all'] = hd.order_dictionary(label_count['all'], 'key', reverse = False) + character_count['all'] = hd.order_dictionary(character_count['all'], 'value', reverse = True) + if self.verbose: print('character_count[\'all\']: {0}'.format(character_count['all'])) + + + + token_to_index = {} + token_to_index[self.UNK] = self.UNK_TOKEN_INDEX + iteration_number = 0 + number_of_unknown_tokens = 0 + if self.verbose: print("parameters['remap_unknown_tokens_to_unk']: {0}".format(parameters['remap_unknown_tokens_to_unk'])) + if self.verbose: print("len(token_count['train'].keys()): {0}".format(len(token_count['train'].keys()))) + for token, count in token_count['all'].items(): + if iteration_number == self.UNK_TOKEN_INDEX: iteration_number += 1 + + if parameters['remap_unknown_tokens_to_unk'] == 1 and \ + (token_count['train'][token] == 0 or \ + parameters['load_only_pretrained_token_embeddings']) and \ + not hd.is_token_in_pretrained_embeddings(token, token_to_vector, parameters) and \ + token not in all_tokens_in_pretraining_dataset: + token_to_index[token] = self.UNK_TOKEN_INDEX + number_of_unknown_tokens += 1 + self.tokens_mapped_to_unk.append(token) + else: + token_to_index[token] = iteration_number + iteration_number += 1 + + infrequent_token_indices = [] + for token, count in token_count['train'].items(): + if 0 < count <= remap_to_unk_count_threshold: + infrequent_token_indices.append(token_to_index[token]) + #if self.verbose: print("len(token_count['train']): {0}".format(len(token_count['train']))) + # if self.verbose: print("len(infrequent_token_indices): {0}".format(len(infrequent_token_indices))) + + # Ensure that both B- and I- versions exist for each label + labels_without_bio = set() + for label in label_count['all'].keys(): + new_label = hd.remove_bio_from_label_name(label) + labels_without_bio.add(new_label) + for label in labels_without_bio: + if label == 'O': + continue + if 
parameters['tagging_format'] == 'bioes': + prefixes = ['B-', 'I-', 'E-', 'S-'] + else: + prefixes = ['B-', 'I-'] + for prefix in prefixes: + l = prefix + label + if l not in label_count['all']: + label_count['all'][l] = 0 + label_count['all'] = hd.order_dictionary(label_count['all'], 'key', reverse = False) + + if parameters['use_pretrained_model']: + + print ("USE_PRETRAINED_MODEL ACTIVE") + self.unique_labels = sorted(list(pretraining_dataset.label_to_index.keys())) + # Make sure labels are compatible with the pretraining dataset. + for label in label_count['all']: + if label not in pretraining_dataset.label_to_index: + raise AssertionError("The label {0} does not exist in the pretraining dataset. ".format(label) + + "Please ensure that only the following labels exist in the dataset: {0}".format(', '.join(self.unique_labels))) + label_to_index = pretraining_dataset.label_to_index.copy() + else: + label_to_index = {} + iteration_number = 0 + for label, count in label_count['all'].items(): + label_to_index[label] = iteration_number + iteration_number += 1 + self.unique_labels.append(label) + + + character_to_index = {} + iteration_number = 0 + for character, count in character_count['all'].items(): + if iteration_number == self.PADDING_CHARACTER_INDEX: iteration_number += 1 + character_to_index[character] = iteration_number + iteration_number += 1 + + + token_to_index = hd.order_dictionary(token_to_index, 'value', reverse = False) + if self.verbose: print('token_to_index: {0}'.format(token_to_index)) + index_to_token = hd.reverse_dictionary(token_to_index) + if parameters['remap_unknown_tokens_to_unk'] == 1: index_to_token[self.UNK_TOKEN_INDEX] = self.UNK + if self.verbose: print('index_to_token: {0}'.format(index_to_token)) + + label_to_index = hd.order_dictionary(label_to_index, 'value', reverse = False) + index_to_label = hd.reverse_dictionary(label_to_index) + + character_to_index = hd.order_dictionary(character_to_index, 'value', reverse = False) + 
index_to_character = hd.reverse_dictionary(character_to_index) + + self.token_to_index = token_to_index + self.index_to_token = index_to_token + self.index_to_character = index_to_character + self.character_to_index = character_to_index + self.index_to_label = index_to_label + self.label_to_index = label_to_index + + + self.tokens = tokens + self.labels = labels + + + + dataset_types=['train','test','valid','deploy'] + token_indices, label_indices, character_indices_padded, character_indices, token_lengths, characters, label_vector_indices = self._convert_to_indices(dataset_types) + + self.token_indices = token_indices + self.label_indices = label_indices + self.character_indices_padded = character_indices_padded + self.character_indices = character_indices + self.token_lengths = token_lengths + self.characters = characters + self.label_vector_indices = label_vector_indices + + self.number_of_classes = max(self.index_to_label.keys()) + 1 + self.vocabulary_size = max(self.index_to_token.keys()) + 1 + self.alphabet_size = max(self.index_to_character.keys()) + 1 + + + # unique_labels_of_interest is used to compute F1-scores. 
+ self.unique_labels_of_interest = list(self.unique_labels) + self.unique_labels_of_interest.remove('O') + + self.unique_label_indices_of_interest = [] + for lab in self.unique_labels_of_interest: + self.unique_label_indices_of_interest.append(label_to_index[lab]) + + self.infrequent_token_indices = infrequent_token_indices + + + elapsed_time = time.time() - start_time + print('done ({0:.2f} seconds)'.format(elapsed_time)) + + + + self.feature_vector_size=0 + + + return token_to_vector + diff --git a/code/LSTM/__init__.py b/code/LSTM/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/code/conlleval b/code/conlleval new file mode 100644 index 0000000..70e4ad2 --- /dev/null +++ b/code/conlleval @@ -0,0 +1,315 @@ +#!/usr/bin/perl -w +# conlleval: evaluate result of processing CoNLL-2000 shared task +# usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file +# README: http://cnts.uia.ac.be/conll2000/chunking/output.html +# options: l: generate LaTeX output for tables like in +# http://cnts.uia.ac.be/conll2003/ner/example.tex +# r: accept raw result tags (without B- and I- prefix; +# assumes one word per chunk) +# d: alternative delimiter tag (default is single space) +# o: alternative outside tag (default is O) +# note: the file should contain lines with items separated +# by $delimiter characters (default space). The final +# two items should contain the correct tag and the +# guessed tag in that order. Sentences should be +# separated from each other by empty lines or lines +# with $boundary fields (default -X-). 
+# url: http://lcg-www.uia.ac.be/conll2000/chunking/ +# started: 1998-09-25 +# version: 2004-01-26 +# author: Erik Tjong Kim Sang + +use strict; + +my $false = 0; +my $true = 42; + +my $boundary = "-X-"; # sentence boundary +my $correct; # current corpus chunk tag (I,O,B) +my $correctChunk = 0; # number of correctly identified chunks +my $correctTags = 0; # number of correct chunk tags +my $correctType; # type of current corpus chunk tag (NP,VP,etc.) +my $delimiter = " "; # field delimiter +my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979) +my $firstItem; # first feature (for sentence boundary checks) +my $foundCorrect = 0; # number of chunks in corpus +my $foundGuessed = 0; # number of identified chunks +my $guessed; # current guessed chunk tag +my $guessedType; # type of current guessed chunk tag +my $i; # miscellaneous counter +my $inCorrect = $false; # currently processed chunk is correct until now +my $lastCorrect = "O"; # previous chunk tag in corpus +my $latex = 0; # generate LaTeX formatted output +my $lastCorrectType = ""; # type of previously identified chunk tag +my $lastGuessed = "O"; # previously identified chunk tag +my $lastGuessedType = ""; # type of previous chunk tag in corpus +my $lastType; # temporary storage for detecting duplicates +my $line; # line +my $nbrOfFeatures = -1; # number of features per line +my $precision = 0.0; # precision score +my $oTag = "O"; # outside tag, default O +my $raw = 0; # raw input: add B to every token +my $recall = 0.0; # recall score +my $tokenCounter = 0; # token counter (ignores sentence breaks) + +my %correctChunk = (); # number of correctly identified chunks per type +my %foundCorrect = (); # number of chunks in corpus per type +my %foundGuessed = (); # number of identified chunks per type + +my @features; # features on line +my @sortedTypes; # sorted list of chunk type names + +# sanity check +while (@ARGV and $ARGV[0] =~ /^-/) { + if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); } + elsif ($ARGV[0] eq "-r") 
{ $raw = 1; shift(@ARGV); } + elsif ($ARGV[0] eq "-d") { + shift(@ARGV); + if (not defined $ARGV[0]) { + die "conlleval: -d requires delimiter character"; + } + $delimiter = shift(@ARGV); + } elsif ($ARGV[0] eq "-o") { + shift(@ARGV); + if (not defined $ARGV[0]) { + die "conlleval: -o requires delimiter character"; + } + $oTag = shift(@ARGV); + } else { die "conlleval: unknown argument $ARGV[0]\n"; } +} +if (@ARGV) { die "conlleval: unexpected command line argument\n"; } +# process input +while () { + chomp($line = $_); + @features = split(/$delimiter/,$line); + if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } + elsif ($nbrOfFeatures != $#features and @features != 0) { + printf STDERR "unexpected number of features: %d (%d)\n", + $#features+1,$nbrOfFeatures+1; + exit(1); + } + if (@features == 0 or + $features[0] eq $boundary) { @features = ($boundary,"O","O"); } + if (@features < 2) { + die "conlleval: unexpected number of features in line $line\n"; + } + if ($raw) { + if ($features[$#features] eq $oTag) { $features[$#features] = "O"; } + if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } + if ($features[$#features] ne "O") { + $features[$#features] = "B-$features[$#features]"; + } + if ($features[$#features-1] ne "O") { + $features[$#features-1] = "B-$features[$#features-1]"; + } + } + # 20040126 ET code which allows hyphens in the types + if ($features[$#features] =~ /^([^-]*)-(.*)$/) { + $guessed = $1; + $guessedType = $2; + } else { + $guessed = $features[$#features]; + $guessedType = ""; + } + pop(@features); + if ($features[$#features] =~ /^([^-]*)-(.*)$/) { + $correct = $1; + $correctType = $2; + } else { + $correct = $features[$#features]; + $correctType = ""; + } + pop(@features); +# ($guessed,$guessedType) = split(/-/,pop(@features)); +# ($correct,$correctType) = split(/-/,pop(@features)); + $guessedType = $guessedType ? $guessedType : ""; + $correctType = $correctType ? 
$correctType : ""; + $firstItem = shift(@features); + + # 1999-06-26 sentence breaks should always be counted as out of chunk + if ( $firstItem eq $boundary ) { $guessed = "O"; } + + if ($inCorrect) { + if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and + &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and + $lastGuessedType eq $lastCorrectType) { + $inCorrect=$false; + $correctChunk++; + $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? + $correctChunk{$lastCorrectType}+1 : 1; + } elsif ( + &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != + &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or + $guessedType ne $correctType ) { + $inCorrect=$false; + } + } + + if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and + &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and + $guessedType eq $correctType) { $inCorrect = $true; } + + if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { + $foundCorrect++; + $foundCorrect{$correctType} = $foundCorrect{$correctType} ? + $foundCorrect{$correctType}+1 : 1; + } + if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { + $foundGuessed++; + $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ? + $foundGuessed{$guessedType}+1 : 1; + } + if ( $firstItem ne $boundary ) { + if ( $correct eq $guessed and $guessedType eq $correctType ) { + $correctTags++; + } + $tokenCounter++; + } + + $lastGuessed = $guessed; + $lastCorrect = $correct; + $lastGuessedType = $guessedType; + $lastCorrectType = $correctType; +} +if ($inCorrect) { + $correctChunk++; + $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 
+ $correctChunk{$lastCorrectType}+1 : 1; +} + +if (not $latex) { + # compute overall precision, recall and FB1 (default values are 0.0) + $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); + $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); + $FB1 = 2*$precision*$recall/($precision+$recall) + if ($precision+$recall > 0); + + # print overall performance + printf "processed $tokenCounter tokens with $foundCorrect phrases; "; + printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; + if ($tokenCounter>0) { + printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; + printf "precision: %6.2f%%; ",$precision; + printf "recall: %6.2f%%; ",$recall; + printf "FB1: %6.2f\n",$FB1; + } +} + +# sort chunk type names +undef($lastType); +@sortedTypes = (); +foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { + if (not($lastType) or $lastType ne $i) { + push(@sortedTypes,($i)); + } + $lastType = $i; +} +# print performance per chunk type +if (not $latex) { + for $i (@sortedTypes) { + $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; + if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } + else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } + if (not($foundCorrect{$i})) { $recall = 0.0; } + else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } + if ($precision+$recall == 0.0) { $FB1 = 0.0; } + else { $FB1 = 2*$precision*$recall/($precision+$recall); } + printf "%17s: ",$i; + printf "precision: %6.2f%%; ",$precision; + printf "recall: %6.2f%%; ",$recall; + printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; + } +} else { + print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; + for $i (@sortedTypes) { + $correctChunk{$i} = $correctChunk{$i} ? 
$correctChunk{$i} : 0; + if (not($foundGuessed{$i})) { $precision = 0.0; } + else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } + if (not($foundCorrect{$i})) { $recall = 0.0; } + else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } + if ($precision+$recall == 0.0) { $FB1 = 0.0; } + else { $FB1 = 2*$precision*$recall/($precision+$recall); } + printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", + $i,$precision,$recall,$FB1; + } + print "\\hline\n"; + $precision = 0.0; + $recall = 0; + $FB1 = 0.0; + $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); + $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); + $FB1 = 2*$precision*$recall/($precision+$recall) + if ($precision+$recall > 0); + printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", + $precision,$recall,$FB1; +} + +exit 0; + +# endOfChunk: checks if a chunk ended between the previous and current word +# arguments: previous and current chunk tags, previous and current types +# note: this code is capable of handling other chunk representations +# than the default CoNLL-2000 ones, see EACL'99 paper of Tjong +# Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 + +sub endOfChunk { + my $prevTag = shift(@_); + my $tag = shift(@_); + my $prevType = shift(@_); + my $type = shift(@_); + my $chunkEnd = $false; + + if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } + if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } + if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } + if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } + + if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } + if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } + if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } + if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } + + if ($prevTag ne "O" and $prevTag ne "." 
and $prevType ne $type) { + $chunkEnd = $true; + } + + # corrected 1998-12-22: these chunks are assumed to have length 1 + if ( $prevTag eq "]" ) { $chunkEnd = $true; } + if ( $prevTag eq "[" ) { $chunkEnd = $true; } + + return($chunkEnd); +} + +# startOfChunk: checks if a chunk started between the previous and current word +# arguments: previous and current chunk tags, previous and current types +# note: this code is capable of handling other chunk representations +# than the default CoNLL-2000 ones, see EACL'99 paper of Tjong +# Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 + +sub startOfChunk { + my $prevTag = shift(@_); + my $tag = shift(@_); + my $prevType = shift(@_); + my $type = shift(@_); + my $chunkStart = $false; + + if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } + if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } + if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } + if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } + + if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } + if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } + if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } + if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } + + if ($tag ne "O" and $tag ne "." 
and $prevType ne $type) { + $chunkStart = $true; + } + + # corrected 1998-12-22: these chunks are assumed to have length 1 + if ( $tag eq "[" ) { $chunkStart = $true; } + if ( $tag eq "]" ) { $chunkStart = $true; } + + return($chunkStart); +} diff --git a/code/entity_lstm.py b/code/entity_lstm.py new file mode 100644 index 0000000..c9013f7 --- /dev/null +++ b/code/entity_lstm.py @@ -0,0 +1,491 @@ +import tensorflow as tf +import numpy as np +import codecs +import re +import time +#import utils_tf +#import utils_nlp +import helper_dataset as hd +import tensorflow.contrib.layers as layers +import os +import pickle +import utils_tf + +# TO DO: ADD CNN LAYER + +def bidirectional_GRU(input,hidden_state_dimension,initializer,sequence_length=None, output_sequence=True): + print ("Biderectional GRU") + with tf.variable_scope("biderectional_GRU"): + if sequence_length==None: + batch_size=1 # ONE WORD(char) + sequence_length = tf.shape(input)[1] + sequence_length = tf.expand_dims(sequence_length, axis=0, name='sequence_length') #NOT SURE IF IT EVER HAPPENS + else: + batch_size= tf.shape(sequence_length)[0] + + + gru_cell={} + initial_state={} + for direction in ["forward","backward"]: + gru_cell[direction] = tf.contrib.rnn.GRUCell(hidden_state_dimension) + initial_state[direction]=gru_cell[direction].zero_state(batch_size, tf.float32) + outputs,final_states = tf.nn.bidirectional_dynamic_rnn(gru_cell["forward"],gru_cell["backward"],input, sequence_length=sequence_length,initial_state_fw=initial_state["forward"],initial_state_bw=initial_state["backward"]) + + + if output_sequence==True: + outputs_forward, outputs_backward = outputs + output = tf.concat([outputs_forward, outputs_backward], axis=2, name='output_sequence') + + else: + final_states_forward, final_states_backward = final_states + + output = tf.concat([final_states_forward, final_states_backward], axis=1, name='output') #111 + + return output + + + + +def bidirectional_LSTM(input, hidden_state_dimension, 
initializer, sequence_length=None, output_sequence=True): + + print ("Biderectional LSTM") + with tf.variable_scope("bidirectional_LSTM"): + if sequence_length == None: + batch_size = 1 + sequence_length = tf.shape(input)[1] + sequence_length = tf.expand_dims(sequence_length, axis=0, name='sequence_length') + else: + batch_size = tf.shape(sequence_length)[0] + + lstm_cell = {} + initial_state = {} + for direction in ["forward", "backward"]: + with tf.variable_scope(direction): + # LSTM cell + lstm_cell[direction] = tf.contrib.rnn.CoupledInputForgetGateLSTMCell(hidden_state_dimension, use_peepholes=False, forget_bias=1.0, initializer=initializer, state_is_tuple=True, activation=tf.tanh) # tf.tanh (default to RELU) + # lstm_cell[direction] = tf.contrib.rnn_cell.GRUCell(hidden_state_dimension,activation=tf.tanh,) + + + # initial state: http://stackoverflow.com/questions/38441589/tensorflow-rnn-initial-state + initial_cell_state = tf.get_variable("initial_cell_state", shape=[1, hidden_state_dimension], dtype=tf.float32, initializer=initializer) + initial_output_state = tf.get_variable("initial_output_state", shape=[1, hidden_state_dimension], dtype=tf.float32, initializer=initializer) + c_states = tf.tile(initial_cell_state, tf.stack([batch_size, 1])) + h_states = tf.tile(initial_output_state, tf.stack([batch_size, 1])) + initial_state[direction] = tf.contrib.rnn.LSTMStateTuple(c_states, h_states) + + # sequence_length must be provided for tf.nn.bidirectional_dynamic_rnn due to internal bug + outputs, final_states = tf.nn.bidirectional_dynamic_rnn(lstm_cell["forward"], + lstm_cell["backward"], + input, + dtype=tf.float32, + sequence_length=sequence_length, + initial_state_fw=initial_state["forward"], + initial_state_bw=initial_state["backward"]) + if output_sequence == True: + outputs_forward, outputs_backward = outputs + output = tf.concat([outputs_forward, outputs_backward], axis=2, name='output_sequence') + else: + # max pooling +# outputs_forward, outputs_backward 
= outputs +# output = tf.concat([outputs_forward, outputs_backward], axis=2, name='output_sequence') +# output = tf.reduce_max(output, axis=1, name='output') + # last pooling + final_states_forward, final_states_backward = final_states + output = tf.concat([final_states_forward[1], final_states_backward[1]], axis=1, name='output') + + return output + + + + +class EntityLSTM(object): + """ + An LSTM architecture for named entity recognition. + Uses a character embedding layer followed by an LSTM to generate vector representation from characters for each token. + Then the character vector is concatenated with token embedding vector, which is input to another LSTM followed by a CRF layer. + """ + def __init__(self, dataset, parameters): + + self.verbose = False + self.feature_vector_length=parameters['Feature_vector_length'] + + # Placeholders for input, output and dropout + self.input_token_indices = tf.placeholder(tf.int32, [None], name="input_token_indices") + self.input_label_indices_vector = tf.placeholder(tf.float32, [None, dataset.number_of_classes], name="input_label_indices_vector") + self.input_label_indices_flat = tf.placeholder(tf.int32, [None], name="input_label_indices_flat") + self.input_token_character_indices = tf.placeholder(tf.int32, [None, None], name="input_token_indices") + self.input_token_lengths = tf.placeholder(tf.int32, [None], name="input_token_lengths") + self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") + + self.input_features=tf.placeholder(tf.float32, [None,self.feature_vector_length], name="features") + + + self.vocabulary_size=dataset.vocabulary_size + + # Internal parameters + initializer = tf.contrib.layers.xavier_initializer() + + if parameters['use_character_lstm']: + with tf.variable_scope("character_embedding"): + self.character_embedding_weights = tf.get_variable( + "character_embedding_weights", + shape=[dataset.alphabet_size, parameters['character_embedding_dimension']], + initializer=initializer) 
+ embedded_characters = tf.nn.embedding_lookup(self.character_embedding_weights, self.input_token_character_indices, name='embedded_characters') + if self.verbose: print("embedded_characters: {0}".format(embedded_characters)) + # utils_tf.variable_summaries(self.character_embedding_weights) + + # Character LSTM layer + with tf.variable_scope('character_lstm') as vs: + if parameters['Use_LSTM']==True: + character_lstm_output = bidirectional_LSTM(embedded_characters, parameters['character_lstm_hidden_state_dimension'], initializer, + sequence_length=self.input_token_lengths, output_sequence=False) + self.character_lstm_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name) + else: + character_lstm_output = bidirectional_GRU(embedded_characters, parameters['character_lstm_hidden_state_dimension'], initializer, + sequence_length=self.input_token_lengths, output_sequence=False) + # Attention, not implemented + + # with tf.variable_scope('attention') as scope: + # word_level_output = task_specific_attention(character_lstm_output,dataset.token_lengths,scope=scope) + # print (w) + + # sentence_inputs = tf.reshape(word_level_output, [self.document_size, self.sentence_size, self.word_output_size]) + + + + + + # Token embedding layer + with tf.variable_scope("token_embedding"): + self.token_embedding_weights = tf.get_variable( + "token_embedding_weights", + shape=[dataset.vocabulary_size, parameters['token_embedding_dimension']], + initializer=initializer, + trainable=not parameters['freeze_token_embeddings']) + embedded_tokens = tf.nn.embedding_lookup(self.token_embedding_weights, self.input_token_indices) + # utils_tf.variable_summaries(self.token_embedding_weights) + + # Concatenate character LSTM outputs and token embeddings + if parameters['use_character_lstm']: + with tf.variable_scope("concatenate_token_and_character_vectors"): + if self.verbose: print('embedded_tokens: {0}'.format(embedded_tokens)) + token_lstm_input = 
tf.concat([character_lstm_output, embedded_tokens], axis=1, name='token_lstm_input') + if self.verbose: print("token_lstm_input: {0}".format(token_lstm_input)) + else: + token_lstm_input = embedded_tokens + + if parameters['use_features_before_final_lstm']: + with tf.variable_scope("features_argumentation_pre_LSTM"): + token_lstm_input=tf.concat([token_lstm_input, self.input_features], 1) + print (token_lstm_input) + + + # Add dropout + with tf.variable_scope("dropout"): + token_lstm_input_drop = tf.nn.dropout(token_lstm_input, self.dropout_keep_prob, name='token_lstm_input_drop') + if self.verbose: print("token_lstm_input_drop: {0}".format(token_lstm_input_drop)) + # https://www.tensorflow.org/api_guides/python/contrib.rnn + # Prepare data shape to match `rnn` function requirements + # Current data input shape: (batch_size, n_steps, n_input) + # Required shape: 'n_steps' tensors list of shape (batch_size, n_input) + token_lstm_input_drop_expanded = tf.expand_dims(token_lstm_input_drop, axis=0, name='token_lstm_input_drop_expanded') + if self.verbose: print("token_lstm_input_drop_expanded: {0}".format(token_lstm_input_drop_expanded)) + + #if parameters['use_features_before_final_lstm']: + # with tf.variable_scope("features_argumentation_pre_LSTM"): + # token_lstm_input_drop_expanded=tf.concat([token_lstm_input_drop_expanded, self.input_features], 1) + # print (token_lstm_input_drop_expanded) + + # Token LSTM layer + with tf.variable_scope('token_lstm') as vs: + if parameters['Use_LSTM']==True: token_lstm_output = bidirectional_LSTM(token_lstm_input_drop_expanded, parameters['token_lstm_hidden_state_dimension'], initializer, output_sequence=True) + else: token_lstm_output = bidirectional_GRU(token_lstm_input_drop_expanded, parameters['token_lstm_hidden_state_dimension'], initializer, output_sequence=True) + token_lstm_output_squeezed = tf.squeeze(token_lstm_output, axis=0) + self.token_lstm_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 
scope=vs.name) + + # Needed only if Bidirectional LSTM is used for token level + with tf.variable_scope("feedforward_after_lstm") as vs: + W = tf.get_variable( + "W", + shape=[2 * parameters['token_lstm_hidden_state_dimension'], parameters['token_lstm_hidden_state_dimension']], + initializer=initializer) + b = tf.Variable(tf.constant(0.0, shape=[parameters['token_lstm_hidden_state_dimension']]), name="bias") + outputs = tf.nn.xw_plus_b(token_lstm_output_squeezed, W, b, name="output_before_tanh") + outputs = tf.nn.tanh(outputs, name="output_after_tanh") + self.token_lstm_variables += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name) + + with tf.variable_scope("feedforward_before_crf") as vs: + W = tf.get_variable( + "W", + shape=[parameters['token_lstm_hidden_state_dimension'], dataset.number_of_classes], + initializer=initializer) + b = tf.Variable(tf.constant(0.0, shape=[dataset.number_of_classes]), name="bias") + scores = tf.nn.xw_plus_b(outputs, W, b, name="scores") + self.unary_scores = scores + self.predictions = tf.argmax(self.unary_scores, 1, name="predictions") + #utils_tf.variable_summaries(W) + # utils_tf.variable_summaries(b) + self.feedforward_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name) + + # CRF layer + if parameters['use_crf']: + print ("CRF IS IN USE") + with tf.variable_scope("crf") as vs: + # Add start and end tokens + small_score = -1000.0 + large_score = 0.0 + sequence_length = tf.shape(self.unary_scores)[0] + unary_scores_with_start_and_end = tf.concat([self.unary_scores, tf.tile( tf.constant(small_score, shape=[1, 2]) , [sequence_length, 1])], 1) + start_unary_scores = [[small_score] * dataset.number_of_classes + [large_score, small_score]] + end_unary_scores = [[small_score] * dataset.number_of_classes + [small_score, large_score]] + self.unary_scores = tf.concat([start_unary_scores, unary_scores_with_start_and_end, end_unary_scores], 0) + start_index = dataset.number_of_classes + end_index = 
dataset.number_of_classes + 1 + input_label_indices_flat_with_start_and_end = tf.concat([ tf.constant(start_index, shape=[1]), self.input_label_indices_flat, tf.constant(end_index, shape=[1]) ], 0) + + # Apply CRF layer + sequence_length = tf.shape(self.unary_scores)[0] + sequence_lengths = tf.expand_dims(sequence_length, axis=0, name='sequence_lengths') + unary_scores_expanded = tf.expand_dims(self.unary_scores, axis=0, name='unary_scores_expanded') + input_label_indices_flat_batch = tf.expand_dims(input_label_indices_flat_with_start_and_end, axis=0, name='input_label_indices_flat_batch') + if self.verbose: print('unary_scores_expanded: {0}'.format(unary_scores_expanded)) + if self.verbose: print('input_label_indices_flat_batch: {0}'.format(input_label_indices_flat_batch)) + if self.verbose: print("sequence_lengths: {0}".format(sequence_lengths)) + # https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/crf + # Compute the log-likelihood of the gold sequences and keep the transition params for inference at test time. 
def define_training_procedure(self, parameters):
    """
    Create the optimizer, the global step counter and the training op for self.loss.

    Reads from parameters:
      'optimizer'               -- one of 'adam', 'sgd' or 'adadelta'
      'learning_rate'           -- passed unchanged to the optimizer constructor
      'gradient_clipping_value' -- clip every gradient element to
                                   [-value, +value]; a falsy or zero value
                                   disables clipping

    Sets self.optimizer, self.global_step and self.train_op.
    Raises ValueError when 'optimizer' is not one of the supported names.
    """
    # Validate the configuration before adding anything to the graph.
    optimizer_name = parameters['optimizer']
    if optimizer_name == 'adam':
        self.optimizer = tf.train.AdamOptimizer(parameters['learning_rate'])
    elif optimizer_name == 'sgd':
        self.optimizer = tf.train.GradientDescentOptimizer(parameters['learning_rate'])
    elif optimizer_name == 'adadelta':
        self.optimizer = tf.train.AdadeltaOptimizer(parameters['learning_rate'])
    else:
        raise ValueError("The 'optimizer' parameter must be either adadelta, adam or sgd.")

    self.global_step = tf.Variable(0, name="global_step", trainable=False)

    grads_and_vars = self.optimizer.compute_gradients(self.loss)

    # Honour the configured clipping bound instead of a hard-coded 5.0.
    # float() is applied because the value may still be a string such as '5.0'
    # when it comes straight from a text parameter file.
    raw_clipping_value = parameters['gradient_clipping_value']
    clipping_bound = abs(float(raw_clipping_value)) if raw_clipping_value else 0.0
    if clipping_bound > 0:
        # Variables that do not influence the loss have a None gradient; leave them untouched.
        grads_and_vars = [
            (grad if grad is None else tf.clip_by_value(grad, -clipping_bound, clipping_bound), var)
            for grad, var in grads_and_vars
        ]

    self.train_op = self.optimizer.apply_gradients(grads_and_vars, global_step=self.global_step)
def load_embeddings_from_pretrained_model(self, sess, dataset, pretraining_dataset, pretrained_embedding_weights, embedding_type='token'):
    """
    Copy embedding rows from a pretrained model into the current embedding matrix.

    For every string (token or character) of `dataset` that also exists in
    `pretraining_dataset`, the row of `pretrained_embedding_weights` indexed by
    the pretraining mapping is written to the row used by the current dataset.
    The special row (UNK token for 'token', padding character for 'character')
    is always copied. The updated matrix is assigned back to the TensorFlow
    variable through `sess`.

    embedding_type must be 'token' or 'character'; anything else raises ValueError.
    """
    if embedding_type == 'token':
        embedding_weights = self.token_embedding_weights
        index_to_string = dataset.index_to_token
        pretraining_string_to_index = pretraining_dataset.token_to_index
        special_index = dataset.UNK_TOKEN_INDEX
        pretraining_special_index = pretraining_dataset.UNK_TOKEN_INDEX
    elif embedding_type == 'character':
        embedding_weights = self.character_embedding_weights
        index_to_string = dataset.index_to_character
        pretraining_string_to_index = pretraining_dataset.character_to_index
        special_index = dataset.PADDING_CHARACTER_INDEX
        pretraining_special_index = pretraining_dataset.PADDING_CHARACTER_INDEX
    else:
        # Previously this fell through and failed later with an unbound local variable.
        raise ValueError("embedding_type must be either 'token' or 'character'.")

    # Load embeddings
    start_time = time.time()
    print('Load {0} embeddings from pretrained model... '.format(embedding_type), end='', flush=True)
    initial_weights = sess.run(embedding_weights.read_value())

    # The special row is copied unconditionally and counted as the first loaded vector.
    initial_weights[special_index] = pretrained_embedding_weights[pretraining_special_index]
    number_of_loaded_vectors = 1

    for index, string in index_to_string.items():
        # Skip the row handled above; use the index that matches the embedding
        # type (the token UNK index was previously used for characters as well).
        if index == special_index:
            continue
        if string in pretraining_string_to_index:
            initial_weights[index] = pretrained_embedding_weights[pretraining_string_to_index[string]]
            number_of_loaded_vectors += 1

    elapsed_time = time.time() - start_time
    print('done ({0:.2f} seconds)'.format(elapsed_time))
    print("number_of_loaded_vectors: {0}".format(number_of_loaded_vectors))
    if embedding_type == 'token':
        print("dataset.vocabulary_size: {0}".format(dataset.vocabulary_size))
    else:
        print("dataset.alphabet_size: {0}".format(dataset.alphabet_size))
    sess.run(embedding_weights.assign(initial_weights))
assert pretraining_dataset.index_to_label == dataset.index_to_label # DEBUG fron F&J + + # If the token and character mappings are exactly the same + if pretraining_dataset.index_to_token == dataset.index_to_token and pretraining_dataset.index_to_character == dataset.index_to_character: + + # Restore the pretrained model + self.saver.restore(sess, pretrained_model_checkpoint_filepath) # Works only when the dimensions of tensor variables are matched. + del pretraining_dataset + + # If the token and character mappings are different between the pretrained model and the current model + else: + print ("INDEX TO TOKEN DO NOT MATCH") + + # Resize the token and character embedding weights to match them with the pretrained model (required in order to restore the pretrained model) + utils_tf.resize_tensor_variable(sess, self.character_embedding_weights, [pretraining_dataset.alphabet_size, parameters['character_embedding_dimension']]) + utils_tf.resize_tensor_variable(sess, self.token_embedding_weights, [pretraining_dataset.vocabulary_size, parameters['token_embedding_dimension']]) + + # Restore the pretrained model + self.saver.restore(sess, pretrained_model_checkpoint_filepath) # Works only when the dimensions of tensor variables are matched. 
+ + # Get pretrained embeddings + character_embedding_weights, token_embedding_weights = sess.run([self.character_embedding_weights, self.token_embedding_weights]) + + # Restore the sizes of token and character embedding weights + utils_tf.resize_tensor_variable(sess, self.character_embedding_weights, [dataset.alphabet_size, parameters['character_embedding_dimension']]) + utils_tf.resize_tensor_variable(sess, self.token_embedding_weights, [dataset.vocabulary_size, parameters['token_embedding_dimension']]) + + # Re-initialize the token and character embedding weights + sess.run(tf.variables_initializer([self.character_embedding_weights, self.token_embedding_weights])) + + # Load embedding weights from pretrained token embeddings first + self.load_pretrained_token_embeddings(sess, dataset, parameters, token_to_vector=token_to_vector) + self.load_embeddings_from_pretrained_model(sess, dataset, pretraining_dataset, token_embedding_weights, embedding_type='token') + self.load_embeddings_from_pretrained_model(sess, dataset, pretraining_dataset, character_embedding_weights, embedding_type='character') + + del pretraining_dataset + del character_embedding_weights + del token_embedding_weights + + # Get transition parameters + transition_params_trained = sess.run(self.transition_parameters) + + parameters={'reload_character_embeddings': True, 'reload_character_lstm':True, 'reload_token_embeddings':True, 'reload_token_lstm':True, 'reload_feedforward':True, 'reload_crf':True} + if not parameters['reload_character_embeddings']: + sess.run(tf.variables_initializer([self.character_embedding_weights])) + if not parameters['reload_character_lstm']: + sess.run(tf.variables_initializer(self.character_lstm_variables)) + if not parameters['reload_token_embeddings']: + sess.run(tf.variables_initializer([self.token_embedding_weights])) + if not parameters['reload_token_lstm']: + sess.run(tf.variables_initializer(self.token_lstm_variables)) + if not parameters['reload_feedforward']: + 
def remap_labels(y_pred, y_true, dataset, evaluation_mode='bio'):
    '''
    Re-index predicted and gold labels so that entity labels come first and 'O' comes last.

    y_pred: list of predicted label indices (as used in dataset.label_to_index)
    y_true: list of gold label indices (same indexing)
    evaluation_mode: only 'bio' is supported; any other value raises ValueError

    Returns a 6-tuple:
      (remapped y_pred, remapped y_true,
       new label indices without 'O', new label names without 'O',
       new label indices including 'O', new label names including 'O')
    '''
    all_unique_labels = dataset.unique_labels
    if evaluation_mode != 'bio':
        raise ValueError("At this point only 'bio' is accepted")

    # Entity labels sorted by (entity type, full label name); 'O' goes at the end.
    entity_names = all_unique_labels[:]
    entity_names.remove('O')
    names_with_o = sorted(entity_names, key=lambda name: (hd.remove_bio_from_label_name(name), name))
    names_with_o.append('O')
    indices_with_o = list(range(len(names_with_o)))
    name_to_new_index = dict(zip(names_with_o, indices_with_o))

    # Map each original label index to its position in the new ordering.
    old_to_new_index = {}
    for new_index, name in enumerate(names_with_o):
        old_to_new_index[dataset.label_to_index[name]] = new_index

    remapped_pred = [old_to_new_index[old_index] for old_index in y_pred]
    remapped_true = [old_to_new_index[old_index] for old_index in y_true]

    # Variants without the 'O' label.
    names_without_o = names_with_o[:]
    names_without_o.remove('O')
    indices_without_o = indices_with_o[:]
    indices_without_o.remove(name_to_new_index['O'])

    return remapped_pred, remapped_true, indices_without_o, names_without_o, indices_with_o, names_with_o
def load_parameters_from_file(parameters_file_name):
    '''
    Read a whitespace-separated "key value" parameter file into a dictionary.

    Each non-empty line holds a key followed by its value. Values are converted:
      - integer literals          -> int
      - the words True / False    -> bool (False really becomes False)
      - other numeric literals    -> float (e.g. 0.005, 5.0)
      - anything else             -> str (unchanged, may contain spaces)
    Blank lines are ignored. A line holding only a key raises ValueError.
    '''
    d = {}
    with open(parameters_file_name) as f:
        for line in f:
            if not line.strip():
                continue
            fields = line.split(None, 1)
            if len(fields) != 2:
                raise ValueError("Expected 'key value' but got: {0!r}".format(line))
            key, val = fields[0], fields[1].strip()

            try:
                d[key] = int(val)
                continue
            except ValueError:
                pass

            if val in ('True', 'False'):
                # bool('False') is True, so compare against the literal instead.
                d[key] = (val == 'True')
                continue

            # Only strings containing a digit are tried as floats, so that words
            # such as 'nan' or 'inf' are kept as plain strings.
            if any(ch.isdigit() for ch in val):
                try:
                    d[key] = float(val)
                    continue
                except ValueError:
                    pass

            d[key] = val
    return d
def remove_file_name_from_the_path_string(path_string):
    '''
    Return path_string without its last os.sep-separated component.
    A string containing no separator yields the empty string.
    '''
    path_components = path_string.split(os.sep)
    return os.sep.join(path_components[:-1])
def reverse_dictionary(dictionary):
    '''
    Return a new mapping with keys and values swapped.

    An input that is exactly a collections.OrderedDict yields an OrderedDict
    that keeps the original item order; any other mapping yields a plain dict.
    When several keys share a value, the last one seen wins.
    '''
    swapped_pairs = [(value, key) for key, value in dictionary.items()]
    if type(dictionary) is collections.OrderedDict:
        return collections.OrderedDict(swapped_pairs)
    return dict(swapped_pairs)
def load_tokens_from_pretrained_token_embeddings(parameters):
    '''
    Return the set of tokens listed in the pretrained embedding file.

    The file at parameters['token_pretrained_embedding_filepath'] is read as
    UTF-8; the first space-separated field of every non-blank line is taken as
    the token. Blank lines are skipped, so the empty string is never reported
    as a token.
    '''
    tokens = set()
    # The context manager closes the file even if reading fails part-way.
    with codecs.open(parameters['token_pretrained_embedding_filepath'], 'r', 'UTF-8') as file_input:
        for cur_line in file_input:
            cur_line = cur_line.strip()
            # ''.split(' ') returns [''], so emptiness has to be tested before splitting.
            if not cur_line:
                continue
            tokens.add(cur_line.split(' ')[0])
    return tokens
+#extract_from_the_tree("FIXED_I2B2_XML/i2b2_2012/training/28.xml","") + + #tokenize=word_tokenize(test) + # print tokenize +#write_all_files_into_one_file("FIXED_I2B2_XML/i2b2_2012/training/") # Add flag "Deal with double qotes as if they were marked -1,1 text global span-move+2 +#timeexp,spanlist = extract_from_the_tree("28.xml","28.xml.txt") +#z=map_time_exp_to_text(spanlist,timeexp) +#write_to_file_pseudo_conLL(z) + +#opening_path={'token_pretrained_embedding_filepath':'glove.6B.100d.txt',"freeze_token_embeddings" :'True'} +#tokens=load_tokens_from_pretrained_token_embeddings(opening_path) +#horrible_list=load_pretrained_token_embeddings(opening_path) + +#print horrible_list["cancer"] +#print tokens \ No newline at end of file diff --git a/code/model.py b/code/model.py index dcf4f0e..e09bf04 100644 --- a/code/model.py +++ b/code/model.py @@ -22,6 +22,20 @@ from feature_extraction.features import extract_features +# NEW + +import DatasetCliner_experimental as Exp + + +import tensorflow as tf +import entity_lstm as entity_model +import training_predict_LSTM +import pickle +import copy +import helper_dataset as hd +import shutil + + # python2 needs to convert to unicdode, but thats default for python3 if sys.version_info.major == 2: @@ -55,6 +69,10 @@ def log(self, out, model_file=None): else: with open(out, 'a') as f: write(f, '%s\n' % log) + + + def __log_str_NEURAL(self,model_file=None): + "" def __log_str(self, model_file=None): @@ -140,7 +158,13 @@ def __init__(self, use_lstm): Instantiate a ClinerModel object. @param use_lstm. Bool indicating whether to train a CRF or LSTM. 
+ + """ + + + + print ("INIT TEST") self._use_lstm = use_lstm self._is_trained = False self._clf = None @@ -148,6 +172,14 @@ def __init__(self, use_lstm): self._training_files = None self._log = None self._text_feats = None + + + self._pretrained_dataset=None + self._pretrained_wordvectors=None + + self._current_model=None + self._parameters=None + @@ -173,12 +205,13 @@ def train(self, train_notes, val=[], test=[]): test_labels = [] if val: + print ("VAL") val_sents = flatten([n.getTokenizedSentences() for n in val]) val_labels = flatten([n.getTokenLabels() for n in val]) + self.train_fit(train_sents,train_labels,val_sents=val_sents,val_labels=val_labels,test_sents=test_sents,test_labels=test_labels) - self.train_fit(train_sents, train_labels, val_sents, val_labels, - test_sents=test_sents, test_labels=test_labels) else: + print ("NO DEV") self.train_fit(train_sents, train_labels, dev_split=0.1, test_sents=test_sents, test_labels=test_labels) @@ -202,7 +235,8 @@ def train_fit(self, train_sents, train_labels, val_sents=None, val_labels=None, self._time_train_begin = strftime("%Y-%m-%d %H:%M:%S", localtime()) # train classifier - voc, clf, dev_score, enabled_features = generic_train('all', + if self._use_lstm==False: + voc, clf, dev_score, enabled_features = generic_train('all', train_sents , train_labels , self._use_lstm , @@ -211,15 +245,43 @@ def train_fit(self, train_sents, train_labels, val_sents=None, val_labels=None, test_sents=test_sents , test_labels=test_labels , dev_split=dev_split ) - - self._is_trained = True - self._vocab = voc - self._clf = clf - self._score = dev_score - self._features = enabled_features + self._is_trained = True + self._vocab = voc + self._clf = clf + self._score = dev_score + self._features = enabled_features + # metadata + self._time_train_end = strftime("%Y-%m-%d %H:%M:%S", localtime()) + + + + + else: + print ("IN ERROR CHECK") + print (dev_split) + parameters,dataset,best = generic_train('all', + train_sents , + train_labels , + 
self._use_lstm , + val_sents=val_sents , + val_labels=val_labels , + test_sents=test_sents , + test_labels=test_labels , + dev_split=dev_split ) + self._is_trained = True + self.pretrained_dataset=dataset + self.parameters=parameters + self._score=best + self._time_train_end = strftime("%Y-%m-%d %H:%M:%S", localtime()) + print ("BEST EPOCH") + print (best) + #self._vocab = voc + #self._clf = clf + #self._score = dev_score + #self._features = enabled_features # metadata - self._time_train_end = strftime("%Y-%m-%d %H:%M:%S", localtime()) + #self._time_train_end = strftime("%Y-%m-%d %H:%M:%S", localtime()) def predict_classes_from_document(self, document): @@ -238,6 +300,9 @@ def predict_classes_from_document(self, document): def predict_classes(self, tokenized_sents): + + + """ ClinerModel::predict_classes() @@ -248,17 +313,46 @@ def predict_classes(self, tokenized_sents): @return List of predictions """ # Predict labels for prose - num_pred = generic_predict('all' , + print ("GENERIC PREDICT") + + self._use_lstm=True + + + if self._use_lstm==True: + if self.parameters==None: + self.parameters=hd.load_parameters_from_file("LSTM_parameters.txt") + + + if self._pretrained_dataset==None: + temp_pretrained_dataset_adress=self.parameters['model_folder']+os.sep+"dataset.pickle" + self.pretrained_dataset= pickle.load(open(temp_pretrained_dataset_adress, 'rb')) + + + + num_pred,model = generic_predict('all' , tokenized_sents , vocab = self._vocab , clf = self._clf , - use_lstm = self._use_lstm) + use_lstm = self._use_lstm, + pretrained_dataset=self._pretrained_dataset, + tokens_to_vec=self._pretrained_wordvector, + current_model=self._current_model, + parameters=self.parameters) + + self._current_model=model + + if self._use_lstm==True: + print ("USE LSTM") + iob_pred=num_pred + else:iob_pred = [ [id2tag[p] for p in seq] for seq in num_pred ] - iob_pred = [ [id2tag[p] for p in seq] for seq in num_pred ] return iob_pred + + print (id2tag) + 
############################################################################ ### Lowest-level (interfaces to ML modules) ### @@ -311,15 +405,137 @@ def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, v if use_lstm: - ######## - # LSTM - ######## + print ("TESTING NEW DATSET OBJECT") + dataset = Exp.Dataset() + + parameters=hd.load_parameters_from_file("LSTM_parameters.txt") + parameters['use_pretrained_model']=False + + + + Datasets_tokens={} + Datasets_labels={} + + Datasets_tokens['train']=train_sents + Datasets_labels['train']=train_labels + + if val_sents!=None: + Datasets_tokens['valid']=val_sents + Datasets_labels['valid']=val_labels + + + + if test_sents!=[]: + Datasets_tokens['test']=test_sents + Datasets_labels['test']=test_labels + + dataset.load_dataset(Datasets_tokens,Datasets_labels,"",parameters) + pickle.dump(dataset, open(os.path.join(parameters['model_folder'], 'dataset.pickle'), 'wb')) + + print (Datasets_tokens['valid'][0]) + print (Datasets_tokens['test'][0]) + - sys.stdout.write('%s\n' % train_sents) - sys.stdout.write('%s\n' % train_labels) - sys.stdout.write('incorportate hierarchical LSTM\n') - exit() + parameters['Feature_vector_length']=dataset.feature_vector_size + parameters['use_features_before_final_lstm']=False + parameters['learning_rate']=0.005 + + + sess = tf.Session() + number_of_sent=list(range(len(dataset.token_indices['train']))) + + with sess.as_default(): + model=entity_model.EntityLSTM(dataset,parameters) + sess.run(tf.global_variables_initializer()) + model.load_pretrained_token_embeddings(sess, dataset,parameters) + epoch_number = -1 + transition_params_trained = np.random.rand(5+2,5+2) + values={} + values["best"]=0 + + f1_dictionary={} + f1_dictionary['best']=0 + + model_saver = tf.train.Saver(max_to_keep=100) + + print ("START TRAINING") + + parameters['conll_like_result_folder']='/tmp/cliner_eval_%d' % random.randint(0,256)+os.sep + + + test_temp = 
os.path.join(parameters['conll_like_result_folder'], 'test/') + train_temp = os.path.join(parameters['conll_like_result_folder'], 'train/') + valid_temp = os.path.join(parameters['conll_like_result_folder'], 'valid/') + + os.mkdir(parameters['conll_like_result_folder']) + os.mkdir(test_temp) + os.mkdir(train_temp) + os.mkdir(valid_temp) + + + + while epoch_number<90: + average_loss_per_phrase=0 + accuracy_per_phase=0 + step = 0 + + epoch_number += 1 + if epoch_number != 0: + sequence_numbers=list(range(len(dataset.token_indices['train']))) + random.shuffle(sequence_numbers) + for sequence_number in sequence_numbers: + loss,accuracy,transition_params_trained=training_predict_LSTM.train_step(sess, dataset, sequence_number, model) + average_loss_per_phrase+=loss + accuracy_per_phase+=accuracy + step += 1 + if step % 10 == 0: + print('Training {0:.2f}% done'.format(step/len(sequence_numbers)*100), end='\r', flush=True) + + model_saver.save(sess, os.path.join(parameters['model_folder'], 'model_{0:05d}.ckpt'.format(epoch_number))) + + + + total_loss=average_loss_per_phrase + total_accuracy=accuracy_per_phase + + average_loss_per_phrase=average_loss_per_phrase/len(number_of_sent) + accuracy_per_phase=accuracy_per_phase/len(number_of_sent) + + + if epoch_number>0: + "" + f1,predictions=training_predict_LSTM.prediction_step(sess,dataset,"test",model,epoch_number,parameters['conll_like_result_folder'],transition_params_trained) + f1_train,_=training_predict_LSTM.prediction_step(sess,dataset,"train", model,epoch_number,parameters['conll_like_result_folder'],transition_params_trained) + f1_valid,_=training_predict_LSTM.prediction_step(sess,dataset,"valid", model,epoch_number,parameters['conll_like_result_folder'],transition_params_trained) + + + correctly_predicted_tokens=training_predict_LSTM.compute_train_accuracy(parameters['conll_like_result_folder']+"valid"+os.sep+"epoche_"+str(epoch_number)+".txt") + + if f1_dictionary['best']