Skip to content
This repository has been archived by the owner on Aug 15, 2020. It is now read-only.

Commit

Permalink
fixed genia feature extraction to work for python3
Browse files — browse the repository at this point in the history
  • Loading branch information
wboag committed Apr 11, 2018
1 parent 7dc0410 commit e5282d0
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 22 deletions.
2 changes: 1 addition & 1 deletion code/feature_extraction/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
# Import feature modules
feat_genia=None
if enabled['GENIA']:
from genia_dir.genia_features import GeniaFeatures
from .genia_dir.genia_features import GeniaFeatures


# Only create UMLS cache if module is available
Expand Down
9 changes: 4 additions & 5 deletions code/feature_extraction/genia_dir/genia_cache.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,25 @@
import cPickle as pickle
import pickle
import os
import sys

parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if parentdir not in sys.path:
sys.path.append(parentdir)

from utilities import load_pickled_obj
from .. import utils


class GeniaCache:
def __init__(self):
try:
prefix = os.path.dirname(__file__)
self.filename = os.path.join( prefix, 'genia_cache' )
self.cache = load_pickled_obj(self.filename)
self.cache = utils.load_pickled_obj(self.filename)
except IOError:
self.cache = {}

def has_key(self, key):
return self.cache.has_key( str(key) )
return str(key) in self.cache

def add_map(self, key, value):
self.cache[ str(key) ] = value
Expand All @@ -29,4 +29,3 @@ def get_map(self, key):

def __del__(self):
pickle.dump( self.cache, open( self.filename, "wb" ) )

28 changes: 22 additions & 6 deletions code/feature_extraction/genia_dir/genia_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@



import interface_genia
from .. import utilities
from . import interface_genia
from .. import utils


class GeniaFeatures:
Expand All @@ -24,10 +24,16 @@ def __init__(self, tagger, data):
data = [ [w for w in sent if w!=''] for sent in data]

# Filter out nonprose sentences
prose = [ sent for sent in data if utilities.is_prose_sentence(sent) ]
prose = [ sent for sent in data if utils.is_prose_sentence(sent) ]

# Process prose sentences with GENIA tagger
self.GENIA_features = iter(interface_genia.genia(tagger, prose))
#self.GENIA_features = iter(interface_genia.genia(tagger, prose))
self.gfeatures = {}
gf = interface_genia.genia(tagger, prose)
for sent,feats in zip(prose, gf):
key = '%'.join(sent)
self.gfeatures[key] = feats
#self.GENIA_features = iter(interface_genia.genia(tagger, prose))



Expand All @@ -47,7 +53,7 @@ def features(self, sentence):
sentence = [w for w in sentence if w!='']

# Mechanism to allow for skipping nonprose
if not utilities.is_prose_sentence(sentence): return []
if not utils.is_prose_sentence(sentence): return []

# Return value is a list of dictionaries (of features)
features_list = [ {} for _ in sentence ]
Expand All @@ -56,14 +62,24 @@ def features(self, sentence):
#print 'len(sentence): ', len(sentence)

# Get the GENIA features of the current sentence
genia_feats = next( self.GENIA_features )
#genia_feats = next( self.GENIA_features )
key = '%'.join(sentence)
genia_feats = self.gfeatures[key]

'''
print [ c['GENIA-word'] for c in genia_feats]
print sentence
print
'''

#print('\n\n\n')
#print(len(sentence), len(genia_feats))
for i in range(len(sentence)):
#print(i)
#print(sentence[i])
#print(genia_feats[i])
#print()
assert len(sentence[i]) == len(genia_feats[i]['GENIA-word'])
#print 'genia_feats: ', [ f['GENIA-word'] for f in genia_feats ]
#print 'len(genia_feats): ', len(genia_feats)
assert len(sentence) == len(genia_feats)
Expand Down
42 changes: 32 additions & 10 deletions code/feature_extraction/genia_dir/interface_genia.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@
import sys
import tempfile

from commands import getstatusoutput
from genia_cache import GeniaCache
#from commands import getstatusoutput
from subprocess import Popen, PIPE
from .genia_cache import GeniaCache

tmp_dir = '/tmp'

Expand All @@ -45,7 +46,6 @@ def genia(geniatagger, data):
if not cache.has_key(sent):
uncached.append(sent)


if uncached:
# write list to file and then feed it to GENIA
genia_dir = os.path.dirname(geniatagger)
Expand All @@ -56,11 +56,13 @@ def genia(geniatagger, data):
for line in uncached: f.write(line + '\n')

# Run genia tagger
print '\t\tRunning GENIA tagger'
print('\t\tRunning GENIA tagger')
genia_dir = os.path.dirname(geniatagger)
stream = getstatusoutput('cd %s ; ./geniatagger -nt %s' %(genia_dir,out))

#print 'stream: ', stream
#stream = getstatusoutput('cd %s ; ./geniatagger -nt %s' %(genia_dir,out))
p = Popen('cd %s ; ./geniatagger -nt %s' %(genia_dir,out),shell=True,stdout=PIPE,stderr=PIPE)
stream_b, err = p.communicate()

stream = stream_b.decode('ascii')

#print '\t\tFinished GENIA tagger'

Expand All @@ -69,17 +71,23 @@ def genia(geniatagger, data):
tagged = []

# if the sentence is too long genia outputs an error.
stream_lines = stream[1].split('\n')
stream_lines = stream.split('\n')

#print('\n\n\n')
#print(stream_lines)
#print('\n\n\n')

# get the line the warning might be on.
potential_warning = "" if len(stream_lines[4:5]) == 0 else stream_lines[4:5][0]
#potential_warning = "" if len(stream_lines[4:5]) == 0 else stream_lines[4:5][0]

genia_stream = None

genia_stream = stream_lines[4:]
#genia_stream = stream_lines[4:]
genia_stream = stream_lines

for tag in genia_stream:
if tag.startswith('warning: the sentence seems to be too long'):
print('WARNING:', tag)
continue

if tag.split(): # Part of line
Expand All @@ -90,7 +98,13 @@ def genia(geniatagger, data):

# Add tagger output to cache
for line,tags in zip(uncached,tagged):
#print(line)
for w,feat in zip(line.split(),tags):
#print('\t', w, feat.split('\t')[0])
assert w == feat.split('\t')[0]
#print('\n\n\n')
cache.add_map(line,tags)
#print('-'*80)

# Remove temp file
os.close(os_handle)
Expand All @@ -99,6 +113,14 @@ def genia(geniatagger, data):

os.remove(out)

for sent in data:
feats = cache.get_map(' '.join(sent))
#print(sent)
for w,feat in zip(sent,feats):
#print('\t', w, feat.split('\t')[0])
assert w == feat.split('\t')[0]
#print('\n\n\n')
#exit()

# Extract features
linefeats = []
Expand Down

0 comments on commit e5282d0

Please sign in to comment.