From d451778703a3fb1b5d9ac66a96d0719faac7cb72 Mon Sep 17 00:00:00 2001
From: wboag
Date: Sun, 11 Feb 2018 18:49:49 -0500
Subject: [PATCH] fixed the python2 crf

---
 code/DatasetCliner_experimental.py |   2 +-
 code/model.py                      | 238 +++++++++--------------------
 code/predict.py                    |  44 +++---
 code/tools.py                      |  70 +++++++++
 code/train.py                      |  11 +-
 requirements.txt                   |   2 -
 6 files changed, 172 insertions(+), 195 deletions(-)

diff --git a/code/DatasetCliner_experimental.py b/code/DatasetCliner_experimental.py
index 8d949f2..d938ccb 100644
--- a/code/DatasetCliner_experimental.py
+++ b/code/DatasetCliner_experimental.py
@@ -204,7 +204,7 @@ def load_dataset(self,avaliable_datasets_sent,avaliable_datasets_labels, dataset
         dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy'
         '''
         start_time = time.time()
-        print('Load dataset... ', end='', flush=True)
+        print('Load dataset... \n')
         if parameters['token_pretrained_embedding_filepath'] != '':
             if token_to_vector==None:
                 token_to_vector = hd.load_pretrained_token_embeddings(parameters)
diff --git a/code/model.py b/code/model.py
index e09bf04..876c97f 100644
--- a/code/model.py
+++ b/code/model.py
@@ -16,35 +16,20 @@
 from time import localtime, strftime
 from collections import defaultdict
 
-from machine_learning import crf
-from notes.documents import labels as tag2id, id2tag
-from tools import flatten, save_list_structure, reconstruct_list
-from feature_extraction.features import extract_features
-
-
-# NEW
-
-import DatasetCliner_experimental as Exp
-
-
-import tensorflow as tf
-import entity_lstm as entity_model
-import training_predict_LSTM
-import pickle
-import copy
-import helper_dataset as hd
-import shutil
+from notes.documents import labels as tag2id, id2tag
+from tools import flatten, save_list_structure, reconstruct_list
+from tools import print_str, print_vec, print_files
 
 
 # python2 needs to convert to unicode, but that's the default for python3
 if sys.version_info.major == 2:
-    func = unicode
+    tostr = unicode
 else:
-    func = str
+    tostr = str
 
 
 def write(f, s):
-    f.write(func(s))
+    f.write(tostr(s))
 
 
@@ -158,13 +143,8 @@ def __init__(self, use_lstm):
         Instantiate a ClinerModel object.
 
         @param use_lstm.  Bool indicating whether to train a CRF or LSTM.
- - """ - - - print ("INIT TEST") self._use_lstm = use_lstm self._is_trained = False self._clf = None @@ -172,13 +152,25 @@ def __init__(self, use_lstm): self._training_files = None self._log = None self._text_feats = None - - - self._pretrained_dataset=None - self._pretrained_wordvectors=None - - self._current_model=None - self._parameters=None + + # Import the tools for either CRF or LSTM + if use_lstm: + # NEW + import DatasetCliner_experimental as Exp + + import tensorflow as tf + import entity_lstm as entity_model + import training_predict_LSTM + import pickle + import copy + import helper_dataset as hd + import shutil + + self._pretrained_dataset=None + self._pretrained_wordvectors=None + + self._current_model=None + self._parameters=None @@ -300,9 +292,6 @@ def predict_classes_from_document(self, document): def predict_classes(self, tokenized_sents): - - - """ ClinerModel::predict_classes() @@ -312,53 +301,46 @@ def predict_classes(self, tokenized_sents): into words @return List of predictions """ - # Predict labels for prose - print ("GENERIC PREDICT") - - self._use_lstm=True - - - if self._use_lstm==True: - if self.parameters==None: - self.parameters=hd.load_parameters_from_file("LSTM_parameters.txt") - - - if self._pretrained_dataset==None: - temp_pretrained_dataset_adress=self.parameters['model_folder']+os.sep+"dataset.pickle" - self.pretrained_dataset= pickle.load(open(temp_pretrained_dataset_adress, 'rb')) - - - num_pred,model = generic_predict('all' , - tokenized_sents , - vocab = self._vocab , - clf = self._clf , - use_lstm = self._use_lstm, - pretrained_dataset=self._pretrained_dataset, - tokens_to_vec=self._pretrained_wordvector, - current_model=self._current_model, - parameters=self.parameters) - - self._current_model=model - - if self._use_lstm==True: - print ("USE LSTM") - iob_pred=num_pred - else:iob_pred = [ [id2tag[p] for p in seq] for seq in num_pred ] - + hyperparams = {} + + # Predict labels for prose + if self._use_lstm: + if self.parameters==None: + hyperprams['parameters'] = hd.load_parameters_from_file("LSTM_parameters.txt") + + if self._pretrained_dataset==None: + temp_pretrained_dataset = os.path.join(hyperparams['parameters']['model_folder'], + "dataset.pickle") + hyperparams['pretrained_dataset'] = pickle.load(open(temp_pretrained_dataset_adress, 'rb')) + + vectorized_pred = generic_predict('all' , + tokenized_sents , + vocab = self._vocab , + clf = self._clf , + use_lstm = self._use_lstm, + hyperparams = hyperparams) + #pretrained_dataset=self._pretrained_dataset, + #tokens_to_vec=self._pretrained_wordvector, + #current_model=self._current_model, + #parameters=self.parameters) + + #self._current_model=model + + if self._use_lstm: + iob_pred = vectorized_pred + else: + iob_pred = [ [id2tag[p] for p in seq] for seq in vectorized_pred ] return iob_pred - - print (id2tag) - ############################################################################ ### Lowest-level (interfaces to ML modules) ### ############################################################################ -def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, val_labels=None, test_sents=[], test_labels=[], dev_split=None): +def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, val_labels=None, test_sents=None, test_labels=None, dev_split=None): ''' generic_train() @@ -420,14 +402,12 @@ def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, v Datasets_labels['train']=train_labels if val_sents!=None: - 
-        Datasets_tokens['valid']=val_sents
-        Datasets_labels['valid']=val_labels
-
+        Datasets_tokens['valid']=val_sents
+        Datasets_labels['valid']=val_labels
 
-
-    if test_sents!=[]:
-        Datasets_tokens['test']=test_sents
-        Datasets_labels['test']=test_labels
+    if test_sents!=None:
+        Datasets_tokens['test']=test_sents
+        Datasets_labels['test']=test_labels
 
     dataset.load_dataset(Datasets_tokens,Datasets_labels,"",parameters)
     pickle.dump(dataset, open(os.path.join(parameters['model_folder'], 'dataset.pickle'), 'wb'))
@@ -490,7 +470,7 @@ def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, v
                 accuracy_per_phase+=accuracy
                 step += 1
                 if step % 10 == 0:
-                    print('Training {0:.2f}% done'.format(step/len(sequence_numbers)*100), end='\r', flush=True)
+                    print('Training {0:.2f}% done\n'.format(step/len(sequence_numbers)*100))
 
             model_saver.save(sess, os.path.join(parameters['model_folder'], 'model_{0:05d}.ckpt'.format(epoch_number)))
@@ -541,6 +521,8 @@ def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, v
     # CRF
     ########
 
+    from feature_extraction.features import extract_features
+
     # vectorize tokenized sentences
     text_features = extract_features(train_sents)
     # type(text_features):
@@ -591,6 +573,9 @@ def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, v
                                      save_list_structure(test_text_features))
         # vectorize test Y
        test_Y = [ [tag2id[y] for y in y_seq] for y_seq in test_labels ]
+    else:
+        test_X = None
+        test_Y = None
 
     sys.stdout.write('\ttraining classifiers %s\n' % p_or_n)
@@ -602,6 +587,7 @@ def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, v
                                          test_X_ids=test_X, test_Y_ids=test_Y)
     else:
         # train using crf
+        from machine_learning import crf
         clf, dev_score = crf.train(X_feats, Y_labels,
                                    val_X=val_X, val_Y=val_Y,
                                    test_X=test_X, test_Y=test_Y)
@@ -609,7 +595,8 @@ def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, v
 
 
 
-def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, pretrained_dataset=None,tokens_to_vec=None, current_model=None, parameters=None):
+#def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, pretrained_dataset=None,tokens_to_vec=None, current_model=None, parameters=None):
+def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, hyperparams):
     '''
     generic_predict()
@@ -623,12 +610,11 @@ def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, pretrained_da
 
     @param use_lstm.        Bool indicating whether clf is a CRF or LSTM.
     '''
 
     # use_lstm=self._use_lstm
-    if use_lstm==True:
+    if use_lstm:
         #parameters=hd.load_parameters_from_file("LSTM_parameters.txt")
         parameters['use_pretrained_model']=True
-
         #model_folder="./models/NN_models"
 
         predictions=[]
         sys.stdout.write('\n use_lstm \n')
 
         dataset=Exp.Dataset()
 
         fictional_labels= copy.deepcopy(tokenized_sents)
         for idx,x in enumerate(fictional_labels):
             for val_id,value in enumerate(x):
                 fictional_labels[idx][val_id]='O'
-
         Datasets_tokens={}
         Datasets_labels={}
-
         Datasets_tokens['deploy']=tokenized_sents
         Datasets_labels['deploy']=fictional_labels
-
         token_to_vector=dataset.load_dataset(Datasets_tokens, Datasets_labels, "", parameters,token_to_vector=tokens_to_vec, pretrained_dataset=pretrained_dataset)
 
         print (dataset.token_indices.keys())
-
         parameters['Feature_vector_length']=dataset.feature_vector_size
         parameters['use_features_before_final_lstm']=False
@@ -659,9 +641,6 @@ def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, pretrained_da
 
         dataset.update_dataset("", ['deploy'],Datasets_tokens,Datasets_labels)
 
-
-
-
         del Datasets_tokens
         del Datasets_labels
@@ -704,7 +683,7 @@ def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, pretrained_da
     sys.stdout.write('\tvectorizing words %s\n' % p_or_n)
 
     if use_lstm:
-        ""
+        pass  # nothing to vectorize here; the LSTM branch above builds its own dataset
         # vectorize tokenized sentences
         #X = []
         #for sent in tokenized_sents:
         #    id_seq = []
         #    for w in sent:
         #        if w in vocab:
         #            id_seq.append(vocab[w])
         #        else:
         #            id_seq.append(vocab['oov'])
         #    X.append(id_seq)
@@ -716,6 +695,8 @@ def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, pretrained_da
     else:
+        from feature_extraction.features import extract_features
+
         # vectorize validation X
         text_features = extract_features(tokenized_sents)
         flat_X_feats = vocab.transform( flatten(text_features) )
@@ -724,86 +705,15 @@ def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, pretrained_da
     sys.stdout.write('\tpredicting labels %s\n' % p_or_n)
 
     # Predict labels
-    use_lstm==True
     if use_lstm:
         print ("TEST_PREDICT")
         exit()
     else:
+        from machine_learning import crf
         predictions = crf.predict(clf, X)
 
     # Format labels from output
     return predictions
-
-
-
-def print_files(f, file_names):
-    '''
-    print_files()
-
-    Pretty formatting for listing the training files in a
-    log.
-
-    @param f.           An open file stream to write to.
-    @param file_names.  A list of filename strings.
-    '''
-    COLUMNS = 4
-    file_names = sorted(file_names)
-    start = 0
-    for row in range(int(math.ceil(float(len(file_names))/COLUMNS))):
-        write(f, u'\t\t')
-        for featname in file_names[start:start+COLUMNS]:
-            write(f, '%-15s' % featname)
-        write(f, u'\n')
-        start += COLUMNS
-
-
-
-def print_vec(f, label, vec):
-    '''
-    print_vec()
-
-    Pretty formatting for displaying a vector of numbers in a log.
-
-    @param f.      An open file stream to write to.
-    @param label.  A description of the numbers (e.g. "recall").
-    @param vec.    A numpy array of the numbers to display.
-    '''
-    COLUMNS = 7
-    start = 0
-    write(f, '\t%-10s: ' % label)
-    if type(vec) != type([]):
-        vec = vec.tolist()
-    for row in range(int(math.ceil(float(len(vec))/COLUMNS))):
-        for featname in vec[start:start+COLUMNS]:
-            write(f, '%7.3f' % featname)
-        write(f, u'\n')
-        start += COLUMNS
-
-
-
-def print_str(f, label, names):
-
-    '''
-    print_str()
-    Pretty formatting for displaying a list of strings in a log
-    @param f.      An open file stream to write to.
-    @param label.  A description of the numbers (e.g. "recall").
-    @param names.  A list of strings.
-    '''
-    COLUMNS = 4
-    start = 0
-    for row in range(int(math.ceil(float(len(names))/COLUMNS))):
-        if row == 0:
-            write(f, '\t%-10s: ' % label)
-        else:
-            write(f, '\t%-10s  ' % '')
-
-        for featname in names[start:start+COLUMNS]:
-            write(f, '%-16s ' % featname)
-
-        write(f, u'\n')
-        start += COLUMNS
-
-
diff --git a/code/predict.py b/code/predict.py
index 087db5e..fc57d8f 100644
--- a/code/predict.py
+++ b/code/predict.py
@@ -13,14 +13,12 @@
 import argparse
 import itertools
 import pickle
-import helper_dataset as hd
-import DatasetCliner_experimental as Exp
-import entity_lstm as entity_model
 import tools
 from model import ClinerModel, write
 from notes.documents import Document
 import copy
 
+
 def main():
 
     parser = argparse.ArgumentParser()
@@ -46,23 +44,23 @@ def main():
 
     # Error check: Ensure that file paths are specified
     if not args.txt:
-        sys.stderr.write('\n\tError: Must provide text files\n\n')
         parser.print_help(sys.stderr)
+        sys.stderr.write('\n\tError: Must provide text files\n\n')
         sys.stderr.write('\n')
         exit(1)
     if not args.output:
-        sys.stderr.write('\n\tError: Must provide output directory\n\n')
         parser.print_help(sys.stderr)
+        sys.stderr.write('\n\tError: Must provide output directory\n\n')
         sys.stderr.write('\n')
         exit(1)
     if not args.model:
-        sys.stderr.write('\n\tError: Must provide path to model\n\n')
         parser.print_help(sys.stderr)
+        sys.stderr.write('\n\tError: Must provide path to model\n\n')
         sys.stderr.write('\n')
         exit(1)
     if not os.path.exists(args.model):
-        sys.stderr.write('\n\tError: ClinerModel does not exist: %s\n\n' % args.model)
         parser.print_help(sys.stderr)
+        sys.stderr.write('\n\tError: ClinerModel does not exist: %s\n\n' % args.model)
         sys.stderr.write('\n')
         exit(1)
@@ -73,19 +71,17 @@ def main():
     if args.format:
         format = args.format
     else:
+        parser.print_help(sys.stderr)
         sys.stderr.write('\n\tERROR: must provide "format" argument\n\n')
-        exit()
+        exit(1)
 
-    # Predict
-
-
-
+    # Predict
     predict(files, args.model, args.output, format=format)
 
 
-def predict(files, model_path, output_dir, format,use_lstm=True):
+def predict(files, model_path, output_dir, format, use_lstm=True):
@@ -93,18 +89,20 @@ def predict(files, model_path, output_dir, format,use_lstm=True):
 
     # Must specify output format
     if format not in ['i2b2']:
         sys.stderr.write('\n\tError: Must specify output format\n')
         sys.stderr.write('\tAvailable formats: i2b2\n')
         sys.stderr.write('\n')
         exit(1)
-    parameters=hd.load_parameters_from_file("LSTM_parameters.txt")
-    parameters['use_pretrained_model']=True
-
 
     # Load model
     #if use_lstm==False:
     with open(model_path, 'rb') as f:
         model = pickle.load(f)
 
-    if use_lstm==True:
-        #model._pretrained_dataset=None
-        #model._pretrained_wordvectors=None
+    if model._use_lstm:
+        import helper_dataset as hd
+        import DatasetCliner_experimental as Exp
+        import entity_lstm as entity_model
 
+        parameters=hd.load_parameters_from_file("LSTM_parameters.txt")
+        parameters['use_pretrained_model']=True
+
         temp_pretrained_dataset_adress=parameters['model_folder']+os.sep+"dataset.pickle"
         model._pretrained_dataset = pickle.load(open(temp_pretrained_dataset_adress, 'rb'))
         model._pretrained_wordvector=hd.load_pretrained_token_embeddings(parameters)
@@ -157,9 +155,6 @@ def predict(files, model_path, output_dir, format,use_lstm=True):
 
     n = len(files)
 
-
-
-
     for i,txt in enumerate(sorted(files)):
 
         note = Document(txt)
 
         #'''
         if os.path.exists(out_path):
-            #print('\tWARNING: prediction file already exists (%s)' % out_path)
-            continue
+            print('\tWARNING: prediction file already exists (%s)' % out_path)
+            #continue
         #'''
 
         sys.stdout.write('%s\n' % '-' * 30)
@@ -179,7 +174,6 @@ def predict(files, model_path, output_dir, format,use_lstm=True):
         sys.stdout.write('\t%s\n\n' % txt)
 
         # Predict concept labels
-        sys.stdout.write('\n Calling Generic Predictor \n')
         labels = model.predict_classes_from_document(note)
 
         # Get predictions in proper format
diff --git a/code/tools.py b/code/tools.py
index 176fe4e..a6597d4 100644
--- a/code/tools.py
+++ b/code/tools.py
@@ -209,6 +209,76 @@ def prose_partition(tokenized_sents, labels=None):
 
 
 
+def print_files(f, file_names):
+    '''
+    print_files()
+
+    Pretty formatting for listing the training files in a
+    log.
+
+    @param f.           An open file stream to write to.
+    @param file_names.  A list of filename strings.
+    '''
+    COLUMNS = 4
+    file_names = sorted(file_names)
+    start = 0
+    for row in range(int(math.ceil(float(len(file_names))/COLUMNS))):
+        write(f, u'\t\t')
+        for featname in file_names[start:start+COLUMNS]:
+            write(f, '%-15s' % featname)
+        write(f, u'\n')
+        start += COLUMNS
+
+
+
+def print_vec(f, label, vec):
+    '''
+    print_vec()
+
+    Pretty formatting for displaying a vector of numbers in a log.
+
+    @param f.      An open file stream to write to.
+    @param label.  A description of the numbers (e.g. "recall").
+    @param vec.    A numpy array of the numbers to display.
+    '''
+    COLUMNS = 7
+    start = 0
+    write(f, '\t%-10s: ' % label)
+    if type(vec) != type([]):
+        vec = vec.tolist()
+    for row in range(int(math.ceil(float(len(vec))/COLUMNS))):
+        for featname in vec[start:start+COLUMNS]:
+            write(f, '%7.3f' % featname)
+        write(f, u'\n')
+        start += COLUMNS
+
+
+
+def print_str(f, label, names):
+    '''
+    print_str()
+
+    Pretty formatting for displaying a list of strings in a log.
+
+    @param f.      An open file stream to write to.
+    @param label.  A description of the strings (e.g. "recall").
+    @param names.  A list of strings.
+    '''
+    COLUMNS = 4
+    start = 0
+    for row in range(int(math.ceil(float(len(names))/COLUMNS))):
+        if row == 0:
+            write(f, '\t%-10s: ' % label)
+        else:
+            write(f, '\t%-10s  ' % '')
+
+        for featname in names[start:start+COLUMNS]:
+            write(f, '%-16s ' % featname)
+
+        write(f, u'\n')
+        start += COLUMNS
+
+
+
 #############################################################
 #  Quick-and-Dirty evaluation of performance
 #############################################################
diff --git a/code/train.py b/code/train.py
index 419051a..21e7f42 100644
--- a/code/train.py
+++ b/code/train.py
@@ -62,7 +62,7 @@ def main():
                         dest = "use_lstm",
                         help = "Whether to use an LSTM model",
                         action = 'store_true',
-                        default = True
+                        default = False
                         )
     parser.add_argument("--format",
                         dest = "format",
@@ -75,19 +75,23 @@ def main():
 
     # Error check: Ensure that file paths are specified
     if not args.txt:
+        parser.print_help(sys.stderr)
         sys.stderr.write('\n\tError: Must provide text files\n')
         sys.stderr.write('\n')
         exit(1)
     if not args.con:
+        parser.print_help(sys.stderr)
         sys.stderr.write('\n\tError: Must provide annotations for text files\n')
         sys.stderr.write('\n')
         exit(1)
     if not args.model:
+        parser.print_help(sys.stderr)
         sys.stderr.write('\n\tError: Must provide valid path to store model\n')
         sys.stderr.write('\n')
         exit(1)
     modeldir = os.path.dirname(args.model)
     if (not os.path.exists(modeldir)) and (modeldir != ''):
+        parser.print_help(sys.stderr)
         sys.stderr.write('\n\tError: Model dir does not exist: %s\n' % modeldir)
         sys.stderr.write('\n')
         exit(1)
@@ -187,9 +191,10 @@ def train(training_list, model_path, format, use_lstm, logfile=None, val=[], tes
     with open(model_path, "wb") as m_file:
         pickle.dump(model, m_file)
 
-    # model.log(logfile , model_file=model_path)
-    #model.log(sys.stdout, model_file=model_path)
+    model.log(logfile, model_file=model_path)
+    model.log(sys.stdout, model_file=model_path)
+
 
 if __name__ == '__main__':
     main()
diff --git a/requirements.txt b/requirements.txt
index efc8a7a..9b83111 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
-wheel
 nltk
 python-crfsuite
 numpy
@@ -6,4 +5,3 @@ scipy
 scikit-learn
 marisa-trie
 repoze.lru
-py4j
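
---

Reviewer note: the core refactor above (a) narrows generic_predict to a single
hyperparams dict, (b) moves the print_* helpers out of model.py into tools.py,
and (c) defers every TensorFlow-dependent import until use_lstm is actually
true, so the CRF path runs under python2 without TensorFlow installed. The
snippet below is an illustrative sketch, not part of the patch, of how the
patched CRF predict path is driven; the model path and the sample sentence are
hypothetical stand-ins, while ClinerModel.predict_classes and the pickled-model
convention are taken from the diff.

    # sketch: exercise the patched CRF predict path (python2- and python3-safe)
    import pickle

    # unpickle a CRF-backed ClinerModel, exactly as predict.py does above
    with open('models/example.model', 'rb') as f:   # hypothetical path
        model = pickle.load(f)

    # tokenized_sents: a list of sentences, each a list of token strings
    tokenized_sents = [['Patient', 'denies', 'chest', 'pain', '.']]

    # With use_lstm False, predict_classes() leaves hyperparams empty,
    # generic_predict() lazily imports machine_learning.crf, and the numeric
    # CRF output is mapped back through id2tag into IOB concept labels.
    iob_pred = model.predict_classes(tokenized_sents)
    print(iob_pred)   # e.g. [['O', 'O', 'B-problem', 'I-problem', 'O']]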