
Commit 184bc8c

Merge branch 'development' of https://github.com/usc-isi-i2/etk into development

saggu committed Jan 21, 2018
2 parents 806a429 + bd6ac53
Showing 9 changed files with 446 additions and 70 deletions.
38 changes: 33 additions & 5 deletions etk/core.py
@@ -130,6 +130,7 @@
_CREATE_KG_NODE_EXTRACTOR = "create_kg_node_extractor"
_ADD_CONSTANT_KG = "add_constant_kg"
_GUARD = "guard"
_GUARDS = "guards"
_STOP_VALUE = 'stop_value'
_MATCH = "match"
_CONTANTS = "constants"
@@ -438,7 +439,8 @@ def process(self, doc, create_knowledge_graph=True, html_description=False):

if not isinstance(input_paths, list):
input_paths = [input_paths]

# get user defined guards
guards = de_config[_GUARDS] if _GUARDS in de_config else None
for input_path in input_paths:
if _FIELDS in de_config:
if input_path not in self.data_extraction_path:
@@ -449,6 +451,8 @@ def process(self, doc, create_knowledge_graph=True, html_description=False):
'\'{}\' is not a valid json path'.format(input_path))
matches = self.data_extraction_path[input_path].find(doc)
for match in matches:
if guards and not self.assert_data_extraction_guard(guards, doc, match.value):
continue
# First rule of DATA Extraction club: Get tokens
# Get the crf tokens
if _TEXT in match.value:
@@ -823,7 +827,7 @@ def extract_as_is(self, d, config=None):
result = self.pseudo_extraction_results(d)
return result

if isinstance(d, dict) and _TEXT in d:
elif isinstance(d, dict) and _TEXT in d:
if d[_TEXT].strip() != '':
result = self.pseudo_extraction_results(d[_TEXT], key=d[_KEY] if _KEY in d else None,
qualifiers=d[_QUALIFIERS] if _QUALIFIERS in d else None)
@@ -835,7 +839,7 @@ def extract_as_is(self, d, config=None):
return None

# this is the case where we are going to put the input object to a field called 'data'
if isinstance(d, dict) or isinstance(d, list):
elif isinstance(d, dict) or isinstance(d, list):
str_d = json.dumps(d, sort_keys=True)
key = hashlib.sha256(str_d).hexdigest().upper()
result = self.pseudo_extraction_results(str_d, key=key)
@@ -1256,6 +1260,29 @@ def run_table_extractor(self, content_extraction, html, table_config):
content_extraction[field_name] = tables
return content_extraction

def assert_data_extraction_guard(self, guards, doc, json_path_result):
# print 'processing guards: {}'.format(guards)
for guard in guards:
try:
jpath_parser = parse(guard['path'])
regex = guard['regex']
if guard['type'] == 'doc':
matches = jpath_parser.find(doc)
elif guard['type'] == 'path':
matches = jpath_parser.find(json_path_result)
else:
print 'guard type "{}" is not a valid type.'.format(guard['type'])
continue
if len(matches) == 0:
return False
for match in matches:
# print match.value, regex
if not re.match(regex, match.value):
return False
except Exception as e:
print 'could not apply guard: {}'.format(guard)
return True

def run_readability(self, content_extraction, html, re_extractor):
recall_priority = False
field_name = None
@@ -2287,12 +2314,13 @@ def create_kg_node_extractor(ds, config, doc, parent_doc_id, doc_id=None, url=No
result = dict()
result['@timestamp_created'] = timestamp_created

# result[_PARENT_DOC_ID] = parent_doc_id
result[_PARENT_DOC_ID] = parent_doc_id
result[_CREATED_BY] = 'etk'
if url:
result[_URL] = url

result[_CONTENT_EXTRACTION] = dict()
result[_RAW_CONTENT] = str(d)

if not doc_id:
if isinstance(d, basestring) or isinstance(d, numbers.Number):
@@ -2322,4 +2350,4 @@ def create_kg_node_extractor(ds, config, doc, parent_doc_id, doc_id=None, url=No

doc['nested_docs'].append(result)
extractions.append({'value': doc_id, 'metadata': {'timestamp_created': timestamp_created}})
return extractions
return extractions
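The new guards feature above skips a data-extraction input path unless every guard matches: each guard applies a JSON path (against the whole document for type "doc", or against the matched segment for type "path") and requires the result to match a regex. A minimal sketch of the guards list that assert_data_extraction_guard expects, as it would look after the JSON extraction config is parsed — the paths and regexes below are purely hypothetical:

    # Hypothetical guards list; the 'path', 'regex', and 'type' keys mirror the
    # lookups performed in assert_data_extraction_guard above.
    guards = [
        {
            'type': 'doc',                              # evaluate the path against the full document
            'path': '$.content_extraction.title.text',  # illustrative JSON path
            'regex': '.*for sale.*'                     # extraction is skipped unless this matches
        },
        {
            'type': 'path',               # evaluate the path against the matched input segment
            'path': '$.context.tag',      # illustrative path into the matched value
            'regex': '^title$'
        }
    ]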
31 changes: 24 additions & 7 deletions etk/data_extractors/table_extractor.py
@@ -64,7 +64,7 @@ def wrap_context(self, text):
def extract(self, table, dic):
# print dic
if table['features']['max_cols_in_a_row'] != 2 and table['features']['no_of_rows'] < 2:
return None
return []
res = []
for row in table['rows']:
if len(row['cells']) != 2:
@@ -202,7 +202,7 @@ def extract(self, html_doc, min_data_rows = 1):
for index_row, row in enumerate(rows):
row_dict = dict()
soup_row = BeautifulSoup(row, 'html.parser')
row_data = ''.join(soup_row.stripped_strings)
row_data = ' '.join(soup_row.stripped_strings)
row_data = row_data.replace("\\t", "").replace("\\r", "").replace("\\n", "")
if row_data != '':
row_len_list.append(len(row_data))
@@ -224,15 +224,15 @@ def extract(self, html_doc, min_data_rows = 1):
cell_dict = dict()
cell_dict["cell"] = str(td)
# cell_dict["text"] = [{"result": {"value": ''.join(td.stripped_strings)}}]
cell_dict["text"] = ''.join(td.stripped_strings)
cell_dict["text"] = ' '.join(td.stripped_strings)
# cell_dict["id"] = 'row_{0}_col_{1}'.format(index_row, index_col)
avg_cell_len += len(cell_dict["text"])
cell_list.append(cell_dict)
for index_col, td in enumerate(soup_row.findAll('td')):
cell_dict = dict()
cell_dict["cell"] = str(td)
# cell_dict["text"] = [{"result": {"value": ''.join(td.stripped_strings)}}]
cell_dict["text"] = ''.join(td.stripped_strings)
cell_dict["text"] = ' '.join(td.stripped_strings)
# cell_dict["id"] = 'row_{0}_col_{1}'.format(index_row, index_col)
avg_cell_len += len(cell_dict["text"])
cell_list.append(cell_dict)
@@ -272,7 +272,7 @@ def extract(self, html_doc, min_data_rows = 1):
h_index = 0
h_bool = True
for col in row.findAll('th'):
col_content = ''.join(col.stripped_strings)
col_content = ' '.join(col.stripped_strings)
h_bool = False
if col_content is None:
continue
@@ -283,7 +283,7 @@ def extract(self, html_doc, min_data_rows = 1):
if(h_index == 1 and h_bool == False):
d_index = 1
for col in row.findAll('td'):
col_content = ''.join(col.stripped_strings)
col_content = ' '.join(col.stripped_strings)
if col_content is None:
d_index += 1
continue
@@ -292,7 +292,7 @@ def extract(self, html_doc, min_data_rows = 1):
d_index += 1

for key, value in col_data.iteritems():
whole_col = ''.join(value)
whole_col = ' '.join(value)
# avg_cell_len += float("%.2f" % mean([len(x) for x in value]))
avg_col_len += sum([len(x) for x in value])
avg_col_len_dev += TableExtraction.pstdev([len(x) for x in value])
@@ -328,6 +328,23 @@ def create_fingerprint(table):
fingerprint = '-'.join(all_tokens)
return fingerprint

@staticmethod
def row_to_text(cells):
res = ''
for c in cells:
res += c['text'] + ' | '
return res

@staticmethod
def table_to_text(rows):
res = ''
for row in rows:
for c in row['cells']:
res += c['text'] + ' | '
res += '\n'
return res


@staticmethod
def gen_html(row_list):
""" Return html table string from a list of data rows """
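The new row_to_text and table_to_text helpers flatten the cell dictionaries built by extract() (each cell carries a 'text' entry) into pipe-separated strings. A small usage sketch with hand-made data — the import path is inferred from the file name, and the class name TableExtraction from the references in the diff above:

    from etk.data_extractors.table_extractor import TableExtraction

    # Rows shaped like the 'rows' structure produced by extract(): each row holds
    # a list of cell dicts, and each cell dict carries the stripped cell text.
    rows = [
        {'cells': [{'text': 'Caliber'}, {'text': '9 mm'}]},
        {'cells': [{'text': 'Capacity'}, {'text': '15 rounds'}]},
    ]

    print(TableExtraction.row_to_text(rows[0]['cells']))  # "Caliber | 9 mm | "
    print(TableExtraction.table_to_text(rows))            # one pipe-separated line per row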
146 changes: 143 additions & 3 deletions etk/resources/extraction_config_table_content.json
@@ -9,7 +9,21 @@
"landmark": [
],
"pickle": {
"my_dic": "/Users/majid/DIG/test_fields.txt"
"calibre_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/calibre_dict.txt",
"capacity_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/capacity_dict.txt",
"length_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/length_dict.txt",
"finish_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/finish_dict.txt",
"manufacturer_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/manufacturer_dict.txt",
"weight_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/weight_dict.txt",
"seller_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/seller_dict.txt",
"id_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/id_dict.txt",
"action_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/action_dict.txt",
"price_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/price_dict.txt",
"model_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/model_dict.txt",
"condition_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/condition_dict.txt",
"description_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/description_dict.txt",
"address_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/address_dict.txt",
"country_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/country_dict.txt"
}
},
"content_extraction": {
@@ -29,11 +43,137 @@
"*.table.tables[*]"
],
"fields": {
"my_field": {
"calibre": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "my_dic"
"dic": "calibre_dict"
}
}
}
},
"capacity": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "capacity_dict"
}
}
}
},
"length": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "length_dict"
}
}
}
},
"finish": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "finish_dict"
}
}
}
},
"manufacturer": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "manufacturer_dict"
}
}
}
},
"weight": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "weight_dict"
}
}
}
},
"seller": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "seller_dict"
}
}
}
},
"id": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "id_dict"
}
}
}
},
"action": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "action_dict"
}
}
}
},
"price": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "price_dict"
}
}
}
},
"model": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "model_dict"
}
}
}
},
"condition": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "condition_dict"
}
}
}
},
"description": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "description_dict"
}
}
}
},
"address": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "address_dict"
}
}
}
},
"country": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "country_dict"
}
}
}
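For context, this extraction config is the kind of object handed to the ETK core for processing; a hypothetical driver is sketched below. Only process() appears in this diff (etk/core.py above), so the Core constructor keyword and the CDR field names ('url', 'raw_content') are assumptions, and the glossary paths in the config must exist locally for it to run:

    import json

    from etk.core import Core  # module path taken from the etk/core.py diff above

    # Load the table-content extraction config shown above (path is illustrative).
    with open('etk/resources/extraction_config_table_content.json') as f:
        config = json.load(f)

    core = Core(extraction_config=config)  # assumption: constructor keyword name

    cdr_doc = {'url': 'http://example.org/listing', 'raw_content': '<html>...</html>'}
    result = core.process(cdr_doc, create_knowledge_graph=True)  # signature shown in this diff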
1 change: 1 addition & 0 deletions etk/unit_tests/ground_truth/nested_doc.jl

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions etk/unit_tests/ground_truth/nested_doc.out.jl

Large diffs are not rendered by default.
