diff --git a/README.rst b/README.rst index 69d1315..3766321 100644 --- a/README.rst +++ b/README.rst @@ -20,13 +20,13 @@ Installation -------- - > pip install -r requirements.txt + $ pip install -r requirements.txt - > wget http://text-machine.cs.uml.edu/cliner/models/silver.model + $ wget http://text-machine.cs.uml.edu/cliner/models/silver.crf - > mv silver.model models/silver.model + $ mv silver.crf models/silver.crf - > cliner predict --txt examples/ex_doc.txt --out data/predictions --model models/silver.model --format i2b2 + $ cliner predict --txt examples/ex_doc.txt --out data/predictions --model models/silver.crf --format i2b2 Out-of-the-Box Model @@ -34,7 +34,7 @@ Out-of-the-Box Model Although i2b2 licensing prevents us from releasing our cliner models trained on i2b2 data, we generated some comparable models from automatically-annotated MIMIC II text. -This silver MIMIC model can be found at http://text-machine.cs.uml.edu/cliner/models/silver.model +This silver MIMIC model can be found at http://text-machine.cs.uml.edu/cliner/models/silver.crf Example Data diff --git a/code/DatasetCliner_experimental.py b/code/DatasetCliner_experimental.py index 8d949f2..d938ccb 100644 --- a/code/DatasetCliner_experimental.py +++ b/code/DatasetCliner_experimental.py @@ -204,7 +204,7 @@ def load_dataset(self,avaliable_datasets_sent,avaliable_datasets_labels, dataset dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy' ''' start_time = time.time() - print('Load dataset... ', end='', flush=True) + print('Load dataset... \n') if parameters['token_pretrained_embedding_filepath'] != '': if token_to_vector==None: token_to_vector = hd.load_pretrained_token_embeddings(parameters) diff --git a/code/feature_extraction/word_features.py b/code/feature_extraction/word_features.py index cd4d270..6d3fbb9 100644 --- a/code/feature_extraction/word_features.py +++ b/code/feature_extraction/word_features.py @@ -37,7 +37,10 @@ def feature_length(word): return {('length', ''): len(word)} def feature_stem_porter(word): - return {('stem_porter', porter_st.stem(word)): 1} + try: + return {('stem_porter', porter_st.stem(word)): 1} + except Exception, e: + return {} def feature_mitre(word): features = {} diff --git a/code/model.py b/code/model.py index e09bf04..876c97f 100644 --- a/code/model.py +++ b/code/model.py @@ -16,35 +16,20 @@ from time import localtime, strftime from collections import defaultdict -from machine_learning import crf -from notes.documents import labels as tag2id, id2tag -from tools import flatten, save_list_structure, reconstruct_list -from feature_extraction.features import extract_features - - -# NEW - -import DatasetCliner_experimental as Exp - - -import tensorflow as tf -import entity_lstm as entity_model -import training_predict_LSTM -import pickle -import copy -import helper_dataset as hd -import shutil +from notes.documents import labels as tag2id, id2tag +from tools import flatten, save_list_structure, reconstruct_list +from tools import print_str, print_vec, print_files # python2 needs to convert to unicdode, but thats default for python3 if sys.version_info.major == 2: - func = unicode + tostr = unicode else: - func = str + tostr = str def write(f, s): - f.write(func(s)) + f.write(tostr(s)) @@ -158,13 +143,8 @@ def __init__(self, use_lstm): Instantiate a ClinerModel object. @param use_lstm. Bool indicating whether to train a CRF or LSTM. 
- - """ - - - print ("INIT TEST") self._use_lstm = use_lstm self._is_trained = False self._clf = None @@ -172,13 +152,25 @@ def __init__(self, use_lstm): self._training_files = None self._log = None self._text_feats = None - - - self._pretrained_dataset=None - self._pretrained_wordvectors=None - - self._current_model=None - self._parameters=None + + # Import the tools for either CRF or LSTM + if use_lstm: + # NEW + import DatasetCliner_experimental as Exp + + import tensorflow as tf + import entity_lstm as entity_model + import training_predict_LSTM + import pickle + import copy + import helper_dataset as hd + import shutil + + self._pretrained_dataset=None + self._pretrained_wordvectors=None + + self._current_model=None + self._parameters=None @@ -300,9 +292,6 @@ def predict_classes_from_document(self, document): def predict_classes(self, tokenized_sents): - - - """ ClinerModel::predict_classes() @@ -312,53 +301,46 @@ def predict_classes(self, tokenized_sents): into words @return List of predictions """ - # Predict labels for prose - print ("GENERIC PREDICT") - - self._use_lstm=True - - - if self._use_lstm==True: - if self.parameters==None: - self.parameters=hd.load_parameters_from_file("LSTM_parameters.txt") - - - if self._pretrained_dataset==None: - temp_pretrained_dataset_adress=self.parameters['model_folder']+os.sep+"dataset.pickle" - self.pretrained_dataset= pickle.load(open(temp_pretrained_dataset_adress, 'rb')) - - - num_pred,model = generic_predict('all' , - tokenized_sents , - vocab = self._vocab , - clf = self._clf , - use_lstm = self._use_lstm, - pretrained_dataset=self._pretrained_dataset, - tokens_to_vec=self._pretrained_wordvector, - current_model=self._current_model, - parameters=self.parameters) - - self._current_model=model - - if self._use_lstm==True: - print ("USE LSTM") - iob_pred=num_pred - else:iob_pred = [ [id2tag[p] for p in seq] for seq in num_pred ] - + hyperparams = {} + + # Predict labels for prose + if self._use_lstm: + if self.parameters==None: + hyperprams['parameters'] = hd.load_parameters_from_file("LSTM_parameters.txt") + + if self._pretrained_dataset==None: + temp_pretrained_dataset = os.path.join(hyperparams['parameters']['model_folder'], + "dataset.pickle") + hyperparams['pretrained_dataset'] = pickle.load(open(temp_pretrained_dataset_adress, 'rb')) + + vectorized_pred = generic_predict('all' , + tokenized_sents , + vocab = self._vocab , + clf = self._clf , + use_lstm = self._use_lstm, + hyperparams = hyperparams) + #pretrained_dataset=self._pretrained_dataset, + #tokens_to_vec=self._pretrained_wordvector, + #current_model=self._current_model, + #parameters=self.parameters) + + #self._current_model=model + + if self._use_lstm: + iob_pred = vectorized_pred + else: + iob_pred = [ [id2tag[p] for p in seq] for seq in vectorized_pred ] return iob_pred - - print (id2tag) - ############################################################################ ### Lowest-level (interfaces to ML modules) ### ############################################################################ -def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, val_labels=None, test_sents=[], test_labels=[], dev_split=None): +def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, val_labels=None, test_sents=None, test_labels=None, dev_split=None): ''' generic_train() @@ -420,14 +402,12 @@ def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, v Datasets_labels['train']=train_labels if val_sents!=None: - 
Datasets_tokens['valid']=val_sents - Datasets_labels['valid']=val_labels - + Datasets_tokens['valid']=val_sents + Datasets_labels['valid']=val_labels - - if test_sents!=[]: - Datasets_tokens['test']=test_sents - Datasets_labels['test']=test_labels + if test_sents!=None: + Datasets_tokens['test']=test_sents + Datasets_labels['test']=test_labels dataset.load_dataset(Datasets_tokens,Datasets_labels,"",parameters) pickle.dump(dataset, open(os.path.join(parameters['model_folder'], 'dataset.pickle'), 'wb')) @@ -490,7 +470,7 @@ def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, v accuracy_per_phase+=accuracy step += 1 if step % 10 == 0: - print('Training {0:.2f}% done'.format(step/len(sequence_numbers)*100), end='\r', flush=True) + print('Training {0:.2f}% done\n'.format(step/len(sequence_numbers)*100)) model_saver.save(sess, os.path.join(parameters['model_folder'], 'model_{0:05d}.ckpt'.format(epoch_number))) @@ -541,6 +521,8 @@ def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, v # CRF ######## + from feature_extraction.features import extract_features + # vectorize tokenized sentences text_features = extract_features(train_sents) # type(text_features): @@ -591,6 +573,9 @@ def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, v save_list_structure(test_text_features)) # vectorize test Y test_Y = [ [tag2id[y] for y in y_seq] for y_seq in test_labels ] + else: + test_X = None + test_Y = None sys.stdout.write('\ttraining classifiers %s\n' % p_or_n) @@ -602,6 +587,7 @@ def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, v test_X_ids=test_X, test_Y_ids=test_Y) else: # train using crf + from machine_learning import crf clf, dev_score = crf.train(X_feats, Y_labels, val_X=val_X, val_Y=val_Y, test_X=test_X, test_Y=test_Y) @@ -609,7 +595,8 @@ def generic_train(p_or_n, train_sents, train_labels, use_lstm, val_sents=None, v -def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, pretrained_dataset=None,tokens_to_vec=None, current_model=None, parameters=None): +#def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, pretrained_dataset=None,tokens_to_vec=None, current_model=None, parameters=None): +def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, hyperparams): ''' generic_predict() @@ -623,12 +610,11 @@ def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, pretrained_da @param use_lstm. Bool indicating whether clf is a CRF or LSTM. 
''' # use_lstm=self._use_lstm - if use_lstm==True: + if use_lstm: #parameters=hd.load_parameters_from_file("LSTM_parameters.txt") parameters['use_pretrained_model']=True - #model_folder="./models/NN_models" predictions=[] sys.stdout.write('\n use_lstm \n') @@ -638,20 +624,16 @@ def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, pretrained_da for idx,x in enumerate(fictional_labels): for val_id,value in enumerate(x): fictional_labels[idx][val_id]='O' - Datasets_tokens={} Datasets_labels={} - Datasets_tokens['deploy']=tokenized_sents Datasets_labels['deploy']=fictional_labels - token_to_vector=dataset.load_dataset(Datasets_tokens, Datasets_labels, "", parameters,token_to_vector=tokens_to_vec, pretrained_dataset=pretrained_dataset) print (dataset.token_indices.keys()) - parameters['Feature_vector_length']=dataset.feature_vector_size parameters['use_features_before_final_lstm']=False @@ -659,9 +641,6 @@ def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, pretrained_da dataset.update_dataset("", ['deploy'],Datasets_tokens,Datasets_labels) - - - del Datasets_tokens del Datasets_labels @@ -704,7 +683,7 @@ def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, pretrained_da sys.stdout.write('\tvectorizing words %s\n' % p_or_n) if use_lstm: - "" + print 'elena didnt do shit here' # vectorize tokenized sentences #X = [] #for sent in tokenized_sents: @@ -716,6 +695,8 @@ def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, pretrained_da # id_seq.append(vocab['oov']) # X.append(id_seq) else: + from feature_extraction.features import extract_features + # vectorize validation X text_features = extract_features(tokenized_sents) flat_X_feats = vocab.transform( flatten(text_features) ) @@ -724,86 +705,15 @@ def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, pretrained_da sys.stdout.write('\tpredicting labels %s\n' % p_or_n) # Predict labels - use_lstm==True if use_lstm: print ("TEST_PREDICT") exit() else: + from machine_learning import crf predictions = crf.predict(clf, X) # Format labels from output return predictions - - -def print_files(f, file_names): - ''' - print_files() - - Pretty formatting for listing the training files in a - log. - - @param f. An open file stream to write to. - @param file_names. A list of filename strings. - ''' - COLUMNS = 4 - file_names = sorted(file_names) - start = 0 - for row in range(int(math.ceil(float(len(file_names))/COLUMNS))): - write(f, u'\t\t') - for featname in file_names[start:start+COLUMNS]: - write(f, '%-15s' % featname) - write(f, u'\n') - start += COLUMNS - - - -def print_vec(f, label, vec): - ''' - print_vec() - - Pretty formatting for displaying a vector of numbers in a log. - - @param f. An open file stream to write to. - @param label. A description of the numbers (e.g. "recall"). - @param vec. A numpy array of the numbers to display. - ''' - COLUMNS = 7 - start = 0 - write(f, '\t%-10s: ' % label) - if type(vec) != type([]): - vec = vec.tolist() - for row in range(int(math.ceil(float(len(vec))/COLUMNS))): - for featname in vec[start:start+COLUMNS]: - write(f, '%7.3f' % featname) - write(f, u'\n') - start += COLUMNS - - - -def print_str(f, label, names): - - ''' - print_str() - Pretty formatting for displaying a list of strings in a log - @param f. An open file stream to write to. - @param label. A description of the numbers (e.g. "recall"). - @param names. A list of strings. 
- ''' - COLUMNS = 4 - start = 0 - for row in range(int(math.ceil(float(len(names))/COLUMNS))): - if row == 0: - write(f, '\t%-10s: ' % label) - else: - write(f, '\t%-10s ' % '') - - for featname in names[start:start+COLUMNS]: - write(f, '%-16s ' % featname) - - write(f, u'\n') - start += COLUMNS - - diff --git a/code/predict.py b/code/predict.py index aa2d435..e47f3b8 100644 --- a/code/predict.py +++ b/code/predict.py @@ -13,13 +13,12 @@ import argparse import itertools import pickle -import helper_dataset as hd -import DatasetCliner_experimental as Exp -import entity_lstm as entity_model + import tools from model import ClinerModel, write from notes.documents import Document import copy + def main(): parser = argparse.ArgumentParser() @@ -45,23 +44,23 @@ def main(): # Error check: Ensure that file paths are specified if not args.txt: - sys.stderr.write('\n\tError: Must provide text files\n\n') parser.print_help(sys.stderr) + sys.stderr.write('\n\tError: Must provide text files\n\n') sys.stderr.write('\n') exit(1) if not args.output: - sys.stderr.write('\n\tError: Must provide output directory\n\n') parser.print_help(sys.stderr) + sys.stderr.write('\n\tError: Must provide output directory\n\n') sys.stderr.write('\n') exit(1) if not args.model: - sys.stderr.write('\n\tError: Must provide path to model\n\n') parser.print_help(sys.stderr) + sys.stderr.write('\n\tError: Must provide path to model\n\n') sys.stderr.write('\n') exit(1) if not os.path.exists(args.model): - sys.stderr.write('\n\tError: ClinerModel does not exist: %s\n\n' % args.model) parser.print_help(sys.stderr) + sys.stderr.write('\n\tError: ClinerModel does not exist: %s\n\n' % args.model) sys.stderr.write('\n') exit(1) @@ -72,19 +71,17 @@ def main(): if args.format: format = args.format else: + parser.print_help(sys.stderr) sys.stderr.write('\n\tERROR: must provide "format" argument\n\n') - exit() + exit(1) - # Predict - - - + # Predict predict(files, args.model, args.output, format=format) -def predict(files, model_path, output_dir, format,use_lstm=True): +def predict(files, model_path, output_dir, format, use_lstm=True): # Must specify output format if format not in ['i2b2']: @@ -92,19 +89,21 @@ def predict(files, model_path, output_dir, format,use_lstm=True): sys.stderr.write('\tAvailable formats: i2b2\n') sys.stderr.write('\n') exit(1) - parameters=hd.load_parameters_from_file("LSTM_parameters.txt") - parameters['use_pretrained_model']=True - + # Load model #if use_lstm==False: with open(model_path, 'rb') as f: model = pickle.load(f,encoding = 'latin1') - if use_lstm==True: - #model._pretrained_dataset=None - #model._pretrained_wordvectors=None + if model._use_lstm: + import helper_dataset as hd + import DatasetCliner_experimental as Exp + import entity_lstm as entity_model + parameters=hd.load_parameters_from_file("LSTM_parameters.txt") + parameters['use_pretrained_model']=True + temp_pretrained_dataset_adress=parameters['model_folder']+os.sep+"dataset.pickle" model._pretrained_dataset = pickle.load(open(temp_pretrained_dataset_adress, 'rb')) model._pretrained_wordvector=hd.load_pretrained_token_embeddings(parameters) @@ -158,9 +157,6 @@ def predict(files, model_path, output_dir, format,use_lstm=True): n = len(files) - - - for i,txt in enumerate(sorted(files)): note = Document(txt) @@ -171,16 +167,15 @@ def predict(files, model_path, output_dir, format,use_lstm=True): #''' if os.path.exists(out_path): - #print('\tWARNING: prediction file already exists (%s)' % out_path) - continue + print('\tWARNING: prediction file 
already exists (%s)' % out_path) + #continue #''' - sys.stdout.write('%s\n' % '-' * 30) + sys.stdout.write('%s\n' % ('-' * 30)) sys.stdout.write('\n\t%d of %d\n' % (i+1,n)) sys.stdout.write('\t%s\n\n' % txt) # Predict concept labels - sys.stdout.write('\n Calling Generic Predictor \n') labels = model.predict_classes_from_document(note) # Get predictions in proper format diff --git a/code/tools.py b/code/tools.py index 176fe4e..8a3a094 100644 --- a/code/tools.py +++ b/code/tools.py @@ -10,6 +10,7 @@ import os import errno import string +import math import re import pickle import numpy as np @@ -209,6 +210,76 @@ def prose_partition(tokenized_sents, labels=None): +def print_files(f, file_names): + ''' + print_files() + + Pretty formatting for listing the training files in a + log. + + @param f. An open file stream to write to. + @param file_names. A list of filename strings. + ''' + COLUMNS = 4 + file_names = sorted(file_names) + start = 0 + for row in range(int(math.ceil(float(len(file_names))/COLUMNS))): + write(f, u'\t\t') + for featname in file_names[start:start+COLUMNS]: + write(f, '%-15s' % featname) + write(f, u'\n') + start += COLUMNS + + + +def print_vec(f, label, vec): + ''' + print_vec() + + Pretty formatting for displaying a vector of numbers in a log. + + @param f. An open file stream to write to. + @param label. A description of the numbers (e.g. "recall"). + @param vec. A numpy array of the numbers to display. + ''' + COLUMNS = 7 + start = 0 + write(f, '\t%-10s: ' % label) + if type(vec) != type([]): + vec = vec.tolist() + for row in range(int(math.ceil(float(len(vec))/COLUMNS))): + for featname in vec[start:start+COLUMNS]: + write(f, '%7.3f' % featname) + write(f, u'\n') + start += COLUMNS + + + +def print_str(f, label, names): + + ''' + print_str() + Pretty formatting for displaying a list of strings in a log + @param f. An open file stream to write to. + @param label. A description of the numbers (e.g. "recall"). + @param names. A list of strings. 
+ ''' + COLUMNS = 4 + start = 0 + for row in range(int(math.ceil(float(len(names))/COLUMNS))): + if row == 0: + write(f, '\t%-10s: ' % label) + else: + write(f, '\t%-10s ' % '') + + for featname in names[start:start+COLUMNS]: + write(f, '%-16s ' % featname) + + write(f, u'\n') + start += COLUMNS + + + ############################################################# # Quick-and-Dirty evaluation of performance ############################################################# diff --git a/code/train.py b/code/train.py index 419051a..21e7f42 100644 --- a/code/train.py +++ b/code/train.py @@ -62,7 +62,7 @@ def main(): dest = "use_lstm", help = "Whether to use an LSTM model", action = 'store_true', - default = True + default = False ) parser.add_argument("--format", dest = "format", @@ -75,19 +75,23 @@ def main(): # Error check: Ensure that file paths are specified if not args.txt: + parser.print_help(sys.stderr) sys.stderr.write('\n\tError: Must provide text files\n') sys.stderr.write('\n') exit(1) if not args.con: + parser.print_help(sys.stderr) sys.stderr.write('\n\tError: Must provide annotations for text files\n') sys.stderr.write('\n') exit(1) if not args.model: + parser.print_help(sys.stderr) sys.stderr.write('\n\tError: Must provide valid path to store model\n') sys.stderr.write('\n') exit(1) modeldir = os.path.dirname(args.model) if (not os.path.exists(modeldir)) and (modeldir != ''): + parser.print_help(sys.stderr) sys.stderr.write('\n\tError: Model dir does not exist: %s\n' % modeldir) sys.stderr.write('\n') exit(1) @@ -187,9 +191,10 @@ def train(training_list, model_path, format, use_lstm, logfile=None, val=[], tes with open(model_path, "wb") as m_file: pickle.dump(model, m_file) - # model.log(logfile , model_file=model_path) - #model.log(sys.stdout, model_file=model_path) + model.log(logfile , model_file=model_path) + model.log(sys.stdout, model_file=model_path) + if __name__ == '__main__': main() diff --git a/requirements.txt b/requirements.txt index efc8a7a..9b83111 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -wheel nltk python-crfsuite numpy @@ -6,4 +5,3 @@ scipy scikit-learn marisa-trie repoze.lru -py4j
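The word_features.py hunk wraps the Porter stemmer in a guard so that a token the stemmer cannot handle no longer aborts feature extraction. A minimal sketch of the same guard, assuming NLTK's PorterStemmer is bound to porter_st as in the surrounding module; the hunk as written uses the Python-2-only ``except Exception, e:`` form, while the sketch below uses the syntax accepted by both interpreters:

    from nltk.stem.porter import PorterStemmer

    porter_st = PorterStemmer()

    def feature_stem_porter(word):
        # Some tokens make the stemmer raise; contribute no feature for
        # that token rather than crashing the whole feature extractor.
        try:
            return {('stem_porter', porter_st.stem(word)): 1}
        except Exception:
            return {}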
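The central refactor in model.py and predict.py is to defer the LSTM-only imports (tensorflow, entity_lstm, DatasetCliner_experimental, helper_dataset, ...) until use_lstm is actually set, so a CRF-only installation never touches them. A self-contained sketch of that deferred-import pattern, using a hypothetical DemoModel rather than the real ClinerModel:

    import importlib

    class DemoModel:
        """Hypothetical stand-in for ClinerModel, illustrating deferred imports."""

        def __init__(self, use_lstm):
            self._use_lstm = use_lstm
            self._backend = None
            if use_lstm:
                # Only pay the import cost (and require the heavy dependency)
                # when the LSTM backend is actually requested.
                self._backend = importlib.import_module('tensorflow')

        def backend_name(self):
            return self._backend.__name__ if self._backend else 'crf'

    if __name__ == '__main__':
        # Runs on a machine without tensorflow installed.
        print(DemoModel(use_lstm=False).backend_name())

The same idea drives the function-local ``from machine_learning import crf`` and ``from feature_extraction.features import extract_features`` statements inside generic_train() and generic_predict().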
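print_files(), print_vec() and print_str() move from model.py into code/tools.py so training and prediction code can share them. A usage sketch, assuming code/ is on sys.path and that tools.py also provides the write() helper these functions call internally:

    import io
    from tools import print_files, print_vec, print_str

    log = io.StringIO()
    print_files(log, ['record-13.txt', 'record-21.txt', 'record-47.txt'])
    print_vec(log, 'recall', [0.81, 0.77, 0.90])
    print_str(log, 'labels', ['problem', 'treatment', 'test'])
    print(log.getvalue())

Each helper lays its items out in fixed-width columns (four names or seven numbers per row) so the training log stays readable.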
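Finally, the run the README drives through the cliner CLI can also be started from Python via predict.predict(). A sketch, assuming code/ is importable, the silver CRF model has been downloaded as described in the README, and the example paths below exist in the working directory:

    import glob
    from predict import predict

    files = glob.glob('examples/*.txt')        # e.g. examples/ex_doc.txt
    predict(files,
            model_path='models/silver.crf',
            output_dir='data/predictions',
            format='i2b2')

predict() unpickles the ClinerModel and, with this patch, only loads the LSTM machinery when model._use_lstm is set.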