From e5282d0a3ddb2c67b66002d7c6d55f07122ecd15 Mon Sep 17 00:00:00 2001 From: wboag Date: Tue, 10 Apr 2018 20:52:37 -0400 Subject: [PATCH] fixed genia feature extraction to work for python3 --- code/feature_extraction/features.py | 2 +- .../genia_dir/genia_cache.py | 9 ++-- .../genia_dir/genia_features.py | 28 ++++++++++--- .../genia_dir/interface_genia.py | 42 ++++++++++++++----- 4 files changed, 59 insertions(+), 22 deletions(-) diff --git a/code/feature_extraction/features.py b/code/feature_extraction/features.py index e96b314..9fad852 100644 --- a/code/feature_extraction/features.py +++ b/code/feature_extraction/features.py @@ -25,7 +25,7 @@ # Import feature modules feat_genia=None if enabled['GENIA']: - from genia_dir.genia_features import GeniaFeatures + from .genia_dir.genia_features import GeniaFeatures # Only create UMLS cache if module is available diff --git a/code/feature_extraction/genia_dir/genia_cache.py b/code/feature_extraction/genia_dir/genia_cache.py index e39de5f..27bfc1b 100644 --- a/code/feature_extraction/genia_dir/genia_cache.py +++ b/code/feature_extraction/genia_dir/genia_cache.py @@ -1,4 +1,4 @@ -import cPickle as pickle +import pickle import os import sys @@ -6,7 +6,7 @@ if parentdir not in sys.path: sys.path.append(parentdir) -from utilities import load_pickled_obj +from .. import utils class GeniaCache: @@ -14,12 +14,12 @@ def __init__(self): try: prefix = os.path.dirname(__file__) self.filename = os.path.join( prefix, 'genia_cache' ) - self.cache = load_pickled_obj(self.filename) + self.cache = utils.load_pickled_obj(self.filename) except IOError: self.cache = {} def has_key(self, key): - return self.cache.has_key( str(key) ) + return str(key) in self.cache def add_map(self, key, value): self.cache[ str(key) ] = value @@ -29,4 +29,3 @@ def get_map(self, key): def __del__(self): pickle.dump( self.cache, open( self.filename, "wb" ) ) - diff --git a/code/feature_extraction/genia_dir/genia_features.py b/code/feature_extraction/genia_dir/genia_features.py index 9a5b877..4d96adc 100644 --- a/code/feature_extraction/genia_dir/genia_features.py +++ b/code/feature_extraction/genia_dir/genia_features.py @@ -8,8 +8,8 @@ -import interface_genia -from .. import utilities +from . import interface_genia +from .. import utils class GeniaFeatures: @@ -24,10 +24,16 @@ def __init__(self, tagger, data): data = [ [w for w in sent if w!=''] for sent in data] # Filter out nonprose sentences - prose = [ sent for sent in data if utilities.is_prose_sentence(sent) ] + prose = [ sent for sent in data if utils.is_prose_sentence(sent) ] # Process prose sentences with GENIA tagger - self.GENIA_features = iter(interface_genia.genia(tagger, prose)) + #self.GENIA_features = iter(interface_genia.genia(tagger, prose)) + self.gfeatures = {} + gf = interface_genia.genia(tagger, prose) + for sent,feats in zip(prose, gf): + key = '%'.join(sent) + self.gfeatures[key] = feats + #self.GENIA_features = iter(interface_genia.genia(tagger, prose)) @@ -47,7 +53,7 @@ def features(self, sentence): sentence = [w for w in sentence if w!=''] # Mechanism to allow for skipping nonprose - if not utilities.is_prose_sentence(sentence): return [] + if not utils.is_prose_sentence(sentence): return [] # Return value is a list of dictionaries (of features) features_list = [ {} for _ in sentence ] @@ -56,7 +62,9 @@ def features(self, sentence): #print 'len(sentence): ', len(sentence) # Get the GENIA features of the current sentence - genia_feats = next( self.GENIA_features ) + #genia_feats = next( self.GENIA_features ) + key = '%'.join(sentence) + genia_feats = self.gfeatures[key] ''' print [ c['GENIA-word'] for c in genia_feats] @@ -64,6 +72,14 @@ def features(self, sentence): print ''' + #print('\n\n\n') + #print(len(sentence), len(genia_feats)) + for i in range(len(sentence)): + #print(i) + #print(sentence[i]) + #print(genia_feats[i]) + #print() + assert len(sentence[i]) == len(genia_feats[i]['GENIA-word']) #print 'genia_feats: ', [ f['GENIA-word'] for f in genia_feats ] #print 'len(genia_feats): ', len(genia_feats) assert len(sentence) == len(genia_feats) diff --git a/code/feature_extraction/genia_dir/interface_genia.py b/code/feature_extraction/genia_dir/interface_genia.py index ee653c9..241e8c9 100644 --- a/code/feature_extraction/genia_dir/interface_genia.py +++ b/code/feature_extraction/genia_dir/interface_genia.py @@ -18,8 +18,9 @@ import sys import tempfile -from commands import getstatusoutput -from genia_cache import GeniaCache +#from commands import getstatusoutput +from subprocess import Popen, PIPE +from .genia_cache import GeniaCache tmp_dir = '/tmp' @@ -45,7 +46,6 @@ def genia(geniatagger, data): if not cache.has_key(sent): uncached.append(sent) - if uncached: # write list to file and then feed it to GENIA genia_dir = os.path.dirname(geniatagger) @@ -56,11 +56,13 @@ def genia(geniatagger, data): for line in uncached: f.write(line + '\n') # Run genia tagger - print '\t\tRunning GENIA tagger' + print('\t\tRunning GENIA tagger') genia_dir = os.path.dirname(geniatagger) - stream = getstatusoutput('cd %s ; ./geniatagger -nt %s' %(genia_dir,out)) - - #print 'stream: ', stream + #stream = getstatusoutput('cd %s ; ./geniatagger -nt %s' %(genia_dir,out)) + p = Popen('cd %s ; ./geniatagger -nt %s' %(genia_dir,out),shell=True,stdout=PIPE,stderr=PIPE) + stream_b, err = p.communicate() + + stream = stream_b.decode('ascii') #print '\t\tFinished GENIA tagger' @@ -69,17 +71,23 @@ def genia(geniatagger, data): tagged = [] # if the sentence is too long genia outputs an error. - stream_lines = stream[1].split('\n') + stream_lines = stream.split('\n') + + #print('\n\n\n') + #print(stream_lines) + #print('\n\n\n') # get the line the warning might be on. - potential_warning = "" if len(stream_lines[4:5]) == 0 else stream_lines[4:5][0] + #potential_warning = "" if len(stream_lines[4:5]) == 0 else stream_lines[4:5][0] genia_stream = None - genia_stream = stream_lines[4:] + #genia_stream = stream_lines[4:] + genia_stream = stream_lines for tag in genia_stream: if tag.startswith('warning: the sentence seems to be too long'): + print('WARNING:', tag) continue if tag.split(): # Part of line @@ -90,7 +98,13 @@ def genia(geniatagger, data): # Add tagger output to cache for line,tags in zip(uncached,tagged): + #print(line) + for w,feat in zip(line.split(),tags): + #print('\t', w, feat.split('\t')[0]) + assert w == feat.split('\t')[0] + #print('\n\n\n') cache.add_map(line,tags) + #print('-'*80) # Remove temp file os.close(os_handle) @@ -99,6 +113,14 @@ def genia(geniatagger, data): os.remove(out) + for sent in data: + feats = cache.get_map(' '.join(sent)) + #print(sent) + for w,feat in zip(sent,feats): + #print('\t', w, feat.split('\t')[0]) + assert w == feat.split('\t')[0] + #print('\n\n\n') + #exit() # Extract features linefeats = []