Skip to content

Commit

Permalink
Merge pull request #294 from usc-isi-i2/development
Browse files: browse the repository at this point in the history
Development
  • Loading branch information
saggu authored Jun 7, 2018
2 parents ae80936 + 44ba689 commit c7ce18d
Show file tree
Hide file tree
Showing 8 changed files with 154 additions and 128 deletions.
92 changes: 61 additions & 31 deletions etk/core.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion etk/data_extractors/date_parser.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -7,7 +7,7 @@ def parse_date(str_date, ignore_future_dates=True, ignore_past_years=20, strict_
if len(str_date) > 100:
return None

str_date = str_date[:20] if len(str_date) > 20 else str_date
str_date = str_date[:25] if len(str_date) > 25 else str_date
str_date = str_date.replace('\r', '')
str_date = str_date.replace('\n', '')
str_date = str_date.replace('<', '')
Expand Down
40 changes: 20 additions & 20 deletions etk/unit_tests/ground_truth/default_spacy_output.jl

Large diffs are not rendered by default.

7 changes: 2 additions & 5 deletions etk/unit_tests/test_default_spacy.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -26,15 +26,12 @@ def setUp(self):

def test_extraction_from_default_spacy(self):
c = Core(extraction_config=self.e_config, load_spacy=True)
dd = codecs.open('temp','w')
for i in range(len(self.ground_truth_input)):

r = c.process(self.ground_truth_input[
i], create_knowledge_graph=True, html_description=False)
# dd.write(json.dumps(r))
# dd.write('\n')
self.assertEquals(self.ground_truth_output[i][
'knowledge_graph'], r['knowledge_graph'])

self.assertEquals(self.ground_truth_output[i]['knowledge_graph'], r['knowledge_graph'])


if __name__ == '__main__':
Expand Down
16 changes: 8 additions & 8 deletions etk/unit_tests/test_extractions_using_dictionary.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -113,16 +113,16 @@ def test_extractor_dictionary(self):
extraction = r["knowledge_graph"]["name"]
ex = [
{
"confidence": 1,
"confidence": 1.0,
"provenance": [
{
"source": {
"segment": "content_strict",
"context": {
"start": 10,
"end": 11,
"start": 7,
"end": 8,
"input": "tokens",
"text": "27 \n my name is <etk 'attribute' = 'name'>helena</etk> height 160cms weight 55 kilos "
"text": "chrissy391 27 my name is <etk 'attribute' = 'name'>helena</etk> height 160cms weight 55 kilos "
},
"document_id": "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
},
Expand All @@ -137,16 +137,16 @@ def test_extractor_dictionary(self):
"value": "helena"
},
{
"confidence": 1,
"confidence": 1.0,
"provenance": [
{
"source": {
"segment": "content_strict",
"context": {
"start": 136,
"end": 137,
"start": 111,
"end": 112,
"input": "tokens",
"text": "\n hey i ' m <etk 'attribute' = 'name'>luna</etk> 3234522013 let ' s explore "
"text": "girls hey i ' m <etk 'attribute' = 'name'>luna</etk> 3234522013 let ' s explore "
},
"document_id": "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
},
Expand Down
8 changes: 4 additions & 4 deletions etk/unit_tests/test_extractions_using_regex.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -89,16 +89,16 @@ def test_extractor_regex(self):
self.assertTrue(len(r['knowledge_graph']["name"]) == 1)
extraction = r['knowledge_graph']["name"][0]
ex = {
"confidence": 1,
"confidence": 1.0,
"provenance": [
{
"source": {
"segment": "content_strict",
"context": {
"start": 41,
"end": 58,
"start": 68,
"end": 85,
"input": "text",
"text": "91 27 \n <etk 'attribute' = 'name'>My name is Helena</etk> height 16"
"text": "br/><br/> <etk 'attribute' = 'name'>My name is Helena</etk> height 16"
},
"document_id": "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
},
Expand Down
77 changes: 38 additions & 39 deletions etk/unit_tests/test_geonames_lookup.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -169,16 +169,16 @@ def test_geonames_lookup(self):
]"""
ex_city_name_gt = [
{
"confidence": 1,
"confidence": 1.0,
"provenance": [
{
"source": {
"segment": "content_strict",
"context": {
"start": 88,
"end": 90,
"start": 69,
"end": 71,
"input": "tokens",
"text": "' s location : \n <etk 'attribute' = 'city_name'>los angeles</etk> , california \n escort ' "
"text": "escort ' s location : <etk 'attribute' = 'city_name'>los angeles</etk> , california escort ' s "
},
"document_id": "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
},
Expand All @@ -192,10 +192,10 @@ def test_geonames_lookup(self):
"source": {
"segment": "content_relaxed",
"context": {
"start": 88,
"end": 90,
"start": 69,
"end": 71,
"input": "tokens",
"text": "' s location : \n <etk 'attribute' = 'city_name'>los angeles</etk> , california \n escort ' "
"text": "escort ' s location : <etk 'attribute' = 'city_name'>los angeles</etk> , california escort ' s "
},
"document_id": "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
},
Expand Down Expand Up @@ -243,14 +243,14 @@ def test_geonames_lookup(self):
]"""
ex_states_usa_codes_gt = [
{
"confidence": 1,
"confidence": 1.0,
"provenance": [
{
"source": {
"segment": "content_strict",
"context": {
"start": 17,
"end": 18,
"start": 14,
"end": 15,
"input": "tokens",
"text": "160cms weight 55 kilos contact <etk 'attribute' = 'states_usa_codes'>me</etk> at escort . here @ "
},
Expand All @@ -266,8 +266,8 @@ def test_geonames_lookup(self):
"source": {
"segment": "content_strict",
"context": {
"start": 193,
"end": 194,
"start": 168,
"end": 169,
"input": "tokens",
"text": "- 452 - 2013 . <etk 'attribute' = 'states_usa_codes'>me</etk> and my friends are on "
},
Expand All @@ -283,10 +283,10 @@ def test_geonames_lookup(self):
"source": {
"segment": "content_strict",
"context": {
"start": 217,
"end": 218,
"start": 191,
"end": 192,
"input": "tokens",
"text": ". . skittlegirl \n\n\n\n call <etk 'attribute' = 'states_usa_codes'>me</etk> on my cell at 323 "
"text": ". . . skittlegirl call <etk 'attribute' = 'states_usa_codes'>me</etk> on my cell at 323 "
},
"document_id": "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
},
Expand All @@ -300,8 +300,8 @@ def test_geonames_lookup(self):
"source": {
"segment": "content_relaxed",
"context": {
"start": 17,
"end": 18,
"start": 14,
"end": 15,
"input": "tokens",
"text": "160cms weight 55 kilos contact <etk 'attribute' = 'states_usa_codes'>me</etk> at escort . here @ "
},
Expand All @@ -317,8 +317,8 @@ def test_geonames_lookup(self):
"source": {
"segment": "content_relaxed",
"context": {
"start": 193,
"end": 194,
"start": 168,
"end": 169,
"input": "tokens",
"text": "- 452 - 2013 . <etk 'attribute' = 'states_usa_codes'>me</etk> and my friends are on "
},
Expand All @@ -334,10 +334,10 @@ def test_geonames_lookup(self):
"source": {
"segment": "content_relaxed",
"context": {
"start": 217,
"end": 218,
"start": 191,
"end": 192,
"input": "tokens",
"text": ". . skittlegirl \n\n\n\n call <etk 'attribute' = 'states_usa_codes'>me</etk> on my cell at 323 "
"text": ". . . skittlegirl call <etk 'attribute' = 'states_usa_codes'>me</etk> on my cell at 323 "
},
"document_id": "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
},
Expand All @@ -352,16 +352,16 @@ def test_geonames_lookup(self):
"value": "me"
},
{
"confidence": 1,
"confidence": 1.0,
"provenance": [
{
"source": {
"segment": "content_strict",
"context": {
"start": 126,
"end": 127,
"start": 103,
"end": 104,
"input": "tokens",
"text": "there are 50 girls looking <etk 'attribute' = 'states_usa_codes'>in</etk> . \n view girls \n "
"text": "there are 50 girls looking <etk 'attribute' = 'states_usa_codes'>in</etk> . view girls hey i "
},
"document_id": "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
},
Expand All @@ -375,8 +375,8 @@ def test_geonames_lookup(self):
"source": {
"segment": "content_strict",
"context": {
"start": 146,
"end": 147,
"start": 121,
"end": 122,
"input": "tokens",
"text": "explore , embrace and indulge <etk 'attribute' = 'states_usa_codes'>in</etk> your favorite fantasy % independent "
},
Expand All @@ -392,10 +392,10 @@ def test_geonames_lookup(self):
"source": {
"segment": "content_relaxed",
"context": {
"start": 126,
"end": 127,
"start": 103,
"end": 104,
"input": "tokens",
"text": "there are 50 girls looking <etk 'attribute' = 'states_usa_codes'>in</etk> . \n view girls \n "
"text": "there are 50 girls looking <etk 'attribute' = 'states_usa_codes'>in</etk> . view girls hey i "
},
"document_id": "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
},
Expand All @@ -409,8 +409,8 @@ def test_geonames_lookup(self):
"source": {
"segment": "content_relaxed",
"context": {
"start": 146,
"end": 147,
"start": 121,
"end": 122,
"input": "tokens",
"text": "explore , embrace and indulge <etk 'attribute' = 'states_usa_codes'>in</etk> your favorite fantasy % independent "
},
Expand All @@ -427,14 +427,14 @@ def test_geonames_lookup(self):
"value": "in"
},
{
"confidence": 1,
"confidence": 1.0,
"provenance": [
{
"source": {
"segment": "content_strict",
"context": {
"start": 173,
"end": 174,
"start": 148,
"end": 149,
"input": "tokens",
"text": "exactly what you deserve call <etk 'attribute' = 'states_usa_codes'>or</etk> text fetish friendly fantasy friendly "
},
Expand All @@ -450,8 +450,8 @@ def test_geonames_lookup(self):
"source": {
"segment": "content_relaxed",
"context": {
"start": 173,
"end": 174,
"start": 148,
"end": 149,
"input": "tokens",
"text": "exactly what you deserve call <etk 'attribute' = 'states_usa_codes'>or</etk> text fetish friendly fantasy friendly "
},
Expand All @@ -475,7 +475,6 @@ def test_geonames_lookup(self):
self.assertEqual(r['knowledge_graph']['city_name'], ex_city_name)

self.assertEqual(r['knowledge_graph']['city'], json.loads(ex_city_gt))

pop_states_usa_codes = json.loads(json.JSONEncoder().encode(r['knowledge_graph']['states_usa_codes']))
self.assertEqual(len(pop_states_usa_codes), len(ex_states_usa_codes_gt))
self.assertEqual(pop_states_usa_codes, ex_states_usa_codes_gt)
Expand Down
Loading

0 comments on commit c7ce18d

Please sign in to comment.