Merge pull request #294 from usc-isi-i2/development

Development
usc-isi-i2 · Jun 7, 2018 · c7ce18d
2 parents ae80936 + 44ba689
commit c7ce18d
Show file tree

Hide file tree

Showing 8 changed files with 154 additions and 128 deletions.
diff --git a/etk/core.py b/etk/core.py
diff --git a/etk/data_extractors/date_parser.py b/etk/data_extractors/date_parser.py
@@ -7,7 +7,7 @@ def parse_date(str_date, ignore_future_dates=True, ignore_past_years=20, strict_
         if len(str_date) > 100:
             return None
 
-        str_date = str_date[:20] if len(str_date) > 20 else str_date
+        str_date = str_date[:25] if len(str_date) > 25 else str_date
         str_date = str_date.replace('\r', '')
         str_date = str_date.replace('\n', '')
         str_date = str_date.replace('<', '')

diff --git a/etk/unit_tests/ground_truth/default_spacy_output.jl b/etk/unit_tests/ground_truth/default_spacy_output.jl
diff --git a/etk/unit_tests/test_default_spacy.py b/etk/unit_tests/test_default_spacy.py
@@ -26,15 +26,12 @@ def setUp(self):
 
     def test_extraction_from_default_spacy(self):
         c = Core(extraction_config=self.e_config, load_spacy=True)
-        dd = codecs.open('temp','w')
         for i in range(len(self.ground_truth_input)):
 
             r = c.process(self.ground_truth_input[
                           i], create_knowledge_graph=True, html_description=False)
-            # dd.write(json.dumps(r))
-            # dd.write('\n')
-            self.assertEquals(self.ground_truth_output[i][
-                              'knowledge_graph'], r['knowledge_graph'])
+
+            self.assertEquals(self.ground_truth_output[i]['knowledge_graph'], r['knowledge_graph'])
 
 
 if __name__ == '__main__':

diff --git a/etk/unit_tests/test_extractions_using_dictionary.py b/etk/unit_tests/test_extractions_using_dictionary.py
@@ -113,16 +113,16 @@ def test_extractor_dictionary(self):
         extraction = r["knowledge_graph"]["name"]
         ex = [
             {
-                "confidence": 1,
+                "confidence": 1.0,
                 "provenance": [
                     {
                         "source": {
                             "segment": "content_strict",
                             "context": {
-                                "start": 10,
-                                "end": 11,
+                                "start": 7,
+                                "end": 8,
                                 "input": "tokens",
-                                "text": "27 \n my name is <etk 'attribute' = 'name'>helena</etk> height 160cms weight 55 kilos "
+                                "text": "chrissy391 27 my name is <etk 'attribute' = 'name'>helena</etk> height 160cms weight 55 kilos "
                             },
                             "document_id": "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
                         },
@@ -137,16 +137,16 @@ def test_extractor_dictionary(self):
                 "value": "helena"
             },
             {
-                "confidence": 1,
+                "confidence": 1.0,
                 "provenance": [
                     {
                         "source": {
                             "segment": "content_strict",
                             "context": {
-                                "start": 136,
-                                "end": 137,
+                                "start": 111,
+                                "end": 112,
                                 "input": "tokens",
-                                "text": "\n hey i ' m <etk 'attribute' = 'name'>luna</etk> 3234522013 let ' s explore "
+                                "text": "girls hey i ' m <etk 'attribute' = 'name'>luna</etk> 3234522013 let ' s explore "
                             },
                             "document_id": "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
                         },

diff --git a/etk/unit_tests/test_extractions_using_regex.py b/etk/unit_tests/test_extractions_using_regex.py
@@ -89,16 +89,16 @@ def test_extractor_regex(self):
         self.assertTrue(len(r['knowledge_graph']["name"]) == 1)
         extraction = r['knowledge_graph']["name"][0]
         ex = {
-            "confidence": 1,
+            "confidence": 1.0,
             "provenance": [
                 {
                     "source": {
                         "segment": "content_strict",
                         "context": {
-                            "start": 41,
-                            "end": 58,
+                            "start": 68,
+                            "end": 85,
                             "input": "text",
-                            "text": "91  27  \n  <etk 'attribute' = 'name'>My name is Helena</etk>  height 16"
+                            "text": "br/><br/>  <etk 'attribute' = 'name'>My name is Helena</etk>  height 16"
                         },
                         "document_id": "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
                     },

diff --git a/etk/unit_tests/test_geonames_lookup.py b/etk/unit_tests/test_geonames_lookup.py
@@ -169,16 +169,16 @@ def test_geonames_lookup(self):
                     ]"""
         ex_city_name_gt = [
             {
-                "confidence": 1,
+                "confidence": 1.0,
                 "provenance": [
                     {
                         "source": {
                             "segment": "content_strict",
                             "context": {
-                                "start": 88,
-                                "end": 90,
+                                "start": 69,
+                                "end": 71,
                                 "input": "tokens",
-                                "text": "' s location : \n <etk 'attribute' = 'city_name'>los angeles</etk> , california \n escort ' "
+                                "text": "escort ' s location : <etk 'attribute' = 'city_name'>los angeles</etk> , california escort ' s "
                             },
                             "document_id": "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
                         },
@@ -192,10 +192,10 @@ def test_geonames_lookup(self):
                         "source": {
                             "segment": "content_relaxed",
                             "context": {
-                                "start": 88,
-                                "end": 90,
+                                "start": 69,
+                                "end": 71,
                                 "input": "tokens",
-                                "text": "' s location : \n <etk 'attribute' = 'city_name'>los angeles</etk> , california \n escort ' "
+                                "text": "escort ' s location : <etk 'attribute' = 'city_name'>los angeles</etk> , california escort ' s "
                             },
                             "document_id": "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
                         },
@@ -243,14 +243,14 @@ def test_geonames_lookup(self):
             ]"""
         ex_states_usa_codes_gt = [
             {
-                "confidence": 1,
+                "confidence": 1.0,
                 "provenance": [
                     {
                         "source": {
                             "segment": "content_strict",
                             "context": {
-                                "start": 17,
-                                "end": 18,
+                                "start": 14,
+                                "end": 15,
                                 "input": "tokens",
                                 "text": "160cms weight 55 kilos contact <etk 'attribute' = 'states_usa_codes'>me</etk> at escort . here @ "
                             },
@@ -266,8 +266,8 @@ def test_geonames_lookup(self):
                         "source": {
                             "segment": "content_strict",
                             "context": {
-                                "start": 193,
-                                "end": 194,
+                                "start": 168,
+                                "end": 169,
                                 "input": "tokens",
                                 "text": "- 452 - 2013 . <etk 'attribute' = 'states_usa_codes'>me</etk> and my friends are on "
                             },
@@ -283,10 +283,10 @@ def test_geonames_lookup(self):
                         "source": {
                             "segment": "content_strict",
                             "context": {
-                                "start": 217,
-                                "end": 218,
+                                "start": 191,
+                                "end": 192,
                                 "input": "tokens",
-                                "text": ". . skittlegirl \n\n\n\n call <etk 'attribute' = 'states_usa_codes'>me</etk> on my cell at 323 "
+                                "text": ". . . skittlegirl call <etk 'attribute' = 'states_usa_codes'>me</etk> on my cell at 323 "
                             },
                             "document_id": "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
                         },
@@ -300,8 +300,8 @@ def test_geonames_lookup(self):
                         "source": {
                             "segment": "content_relaxed",
                             "context": {
-                                "start": 17,
-                                "end": 18,
+                                "start": 14,
+                                "end": 15,
                                 "input": "tokens",
                                 "text": "160cms weight 55 kilos contact <etk 'attribute' = 'states_usa_codes'>me</etk> at escort . here @ "
                             },
@@ -317,8 +317,8 @@ def test_geonames_lookup(self):
                         "source": {
                             "segment": "content_relaxed",
                             "context": {
-                                "start": 193,
-                                "end": 194,
+                                "start": 168,
+                                "end": 169,
                                 "input": "tokens",
                                 "text": "- 452 - 2013 . <etk 'attribute' = 'states_usa_codes'>me</etk> and my friends are on "
                             },
@@ -334,10 +334,10 @@ def test_geonames_lookup(self):
                         "source": {
                             "segment": "content_relaxed",
                             "context": {
-                                "start": 217,
-                                "end": 218,
+                                "start": 191,
+                                "end": 192,
                                 "input": "tokens",
-                                "text": ". . skittlegirl \n\n\n\n call <etk 'attribute' = 'states_usa_codes'>me</etk> on my cell at 323 "
+                                "text": ". . . skittlegirl call <etk 'attribute' = 'states_usa_codes'>me</etk> on my cell at 323 "
                             },
                             "document_id": "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
                         },
@@ -352,16 +352,16 @@ def test_geonames_lookup(self):
                 "value": "me"
             },
             {
-                "confidence": 1,
+                "confidence": 1.0,
                 "provenance": [
                     {
                         "source": {
                             "segment": "content_strict",
                             "context": {
-                                "start": 126,
-                                "end": 127,
+                                "start": 103,
+                                "end": 104,
                                 "input": "tokens",
-                                "text": "there are 50 girls looking <etk 'attribute' = 'states_usa_codes'>in</etk> . \n view girls \n "
+                                "text": "there are 50 girls looking <etk 'attribute' = 'states_usa_codes'>in</etk> . view girls hey i "
                             },
                             "document_id": "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
                         },
@@ -375,8 +375,8 @@ def test_geonames_lookup(self):
                         "source": {
                             "segment": "content_strict",
                             "context": {
-                                "start": 146,
-                                "end": 147,
+                                "start": 121,
+                                "end": 122,
                                 "input": "tokens",
                                 "text": "explore , embrace and indulge <etk 'attribute' = 'states_usa_codes'>in</etk> your favorite fantasy % independent "
                             },
@@ -392,10 +392,10 @@ def test_geonames_lookup(self):
                         "source": {
                             "segment": "content_relaxed",
                             "context": {
-                                "start": 126,
-                                "end": 127,
+                                "start": 103,
+                                "end": 104,
                                 "input": "tokens",
-                                "text": "there are 50 girls looking <etk 'attribute' = 'states_usa_codes'>in</etk> . \n view girls \n "
+                                "text": "there are 50 girls looking <etk 'attribute' = 'states_usa_codes'>in</etk> . view girls hey i "
                             },
                             "document_id": "1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21"
                         },
@@ -409,8 +409,8 @@ def test_geonames_lookup(self):
                         "source": {
                             "segment": "content_relaxed",
                             "context": {
-                                "start": 146,
-                                "end": 147,
+                                "start": 121,
+                                "end": 122,
                                 "input": "tokens",
                                 "text": "explore , embrace and indulge <etk 'attribute' = 'states_usa_codes'>in</etk> your favorite fantasy % independent "
                             },
@@ -427,14 +427,14 @@ def test_geonames_lookup(self):
                 "value": "in"
             },
             {
-                "confidence": 1,
+                "confidence": 1.0,
                 "provenance": [
                     {
                         "source": {
                             "segment": "content_strict",
                             "context": {
-                                "start": 173,
-                                "end": 174,
+                                "start": 148,
+                                "end": 149,
                                 "input": "tokens",
                                 "text": "exactly what you deserve call <etk 'attribute' = 'states_usa_codes'>or</etk> text fetish friendly fantasy friendly "
                             },
@@ -450,8 +450,8 @@ def test_geonames_lookup(self):
                         "source": {
                             "segment": "content_relaxed",
                             "context": {
-                                "start": 173,
-                                "end": 174,
+                                "start": 148,
+                                "end": 149,
                                 "input": "tokens",
                                 "text": "exactly what you deserve call <etk 'attribute' = 'states_usa_codes'>or</etk> text fetish friendly fantasy friendly "
                             },
@@ -475,7 +475,6 @@ def test_geonames_lookup(self):
         self.assertEqual(r['knowledge_graph']['city_name'], ex_city_name)
 
         self.assertEqual(r['knowledge_graph']['city'], json.loads(ex_city_gt))
-
         pop_states_usa_codes = json.loads(json.JSONEncoder().encode(r['knowledge_graph']['states_usa_codes']))
         self.assertEqual(len(pop_states_usa_codes), len(ex_states_usa_codes_gt))
         self.assertEqual(pop_states_usa_codes, ex_states_usa_codes_gt)