
Commit 184bc8c

Merge branch 'development' of https://github.com/usc-isi-i2/etk into development

saggu committed Jan 21, 2018
2 parents 806a429 + bd6ac53
Showing 9 changed files with 446 additions and 70 deletions.
38 changes: 33 additions & 5 deletions etk/core.py
@@ -130,6 +130,7 @@
_CREATE_KG_NODE_EXTRACTOR = "create_kg_node_extractor"
_ADD_CONSTANT_KG = "add_constant_kg"
_GUARD = "guard"
_GUARDS = "guards"
_STOP_VALUE = 'stop_value'
_MATCH = "match"
_CONTANTS = "constants"
@@ -438,7 +439,8 @@ def process(self, doc, create_knowledge_graph=True, html_description=False):

if not isinstance(input_paths, list):
input_paths = [input_paths]

# get user defined guards
guards = de_config[_GUARDS] if _GUARDS in de_config else None
for input_path in input_paths:
if _FIELDS in de_config:
if input_path not in self.data_extraction_path:
@@ -449,6 +451,8 @@ def process(self, doc, create_knowledge_graph=True, html_description=False):
'\'{}\' is not a valid json path'.format(input_path))
matches = self.data_extraction_path[input_path].find(doc)
for match in matches:
if guards and not self.assert_data_extraction_guard(guards, doc, match.value):
continue
# First rule of DATA Extraction club: Get tokens
# Get the crf tokens
if _TEXT in match.value:
@@ -823,7 +827,7 @@ def extract_as_is(self, d, config=None):
result = self.pseudo_extraction_results(d)
return result

if isinstance(d, dict) and _TEXT in d:
elif isinstance(d, dict) and _TEXT in d:
if d[_TEXT].strip() != '':
result = self.pseudo_extraction_results(d[_TEXT], key=d[_KEY] if _KEY in d else None,
qualifiers=d[_QUALIFIERS] if _QUALIFIERS in d else None)
@@ -835,7 +839,7 @@ def extract_as_is(self, d, config=None):
return None

# this is the case where we are going to put the input object to a field called 'data'
if isinstance(d, dict) or isinstance(d, list):
elif isinstance(d, dict) or isinstance(d, list):
str_d = json.dumps(d, sort_keys=True)
key = hashlib.sha256(str_d).hexdigest().upper()
result = self.pseudo_extraction_results(str_d, key=key)
@@ -1256,6 +1260,29 @@ def run_table_extractor(self, content_extraction, html, table_config):
content_extraction[field_name] = tables
return content_extraction

def assert_data_extraction_guard(self, guards, doc, json_path_result):
# print 'processing guards: {}'.format(guards)
for guard in guards:
try:
jpath_parser = parse(guard['path'])
regex = guard['regex']
if guard['type'] == 'doc':
matches = jpath_parser.find(doc)
elif guard['type'] == 'path':
matches = jpath_parser.find(json_path_result)
else:
print 'guard type "{}" is not a valid type.'.format(guard['type'])
continue
if len(matches) == 0:
return False
for match in matches:
# print match.value, regex
if not re.match(regex, match.value):
return False
except Exception as e:
print 'could not apply guard: {}'.format(guard)
return True

def run_readability(self, content_extraction, html, re_extractor):
recall_priority = False
field_name = None
@@ -2287,12 +2314,13 @@ def create_kg_node_extractor(ds, config, doc, parent_doc_id, doc_id=None, url=No
result = dict()
result['@timestamp_created'] = timestamp_created

# result[_PARENT_DOC_ID] = parent_doc_id
result[_PARENT_DOC_ID] = parent_doc_id
result[_CREATED_BY] = 'etk'
if url:
result[_URL] = url

result[_CONTENT_EXTRACTION] = dict()
result[_RAW_CONTENT] = str(d)

if not doc_id:
if isinstance(d, basestring) or isinstance(d, numbers.Number):
@@ -2322,4 +2350,4 @@ def create_kg_node_extractor(ds, config, doc, parent_doc_id, doc_id=None, url=No

doc['nested_docs'].append(result)
extractions.append({'value': doc_id, 'metadata': {'timestamp_created': timestamp_created}})
return extractions
return extractions
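The new guards feature above skips a data-extraction input path unless every guard matches: each guard applies a JSON path (against the whole document for type "doc", or against the matched segment for type "path") and requires the result to match a regex. A minimal sketch of the guards list that assert_data_extraction_guard expects, as it would look after the JSON extraction config is parsed — the paths and regexes below are purely hypothetical:

    # Hypothetical guards list; the 'path', 'regex', and 'type' keys mirror the
    # lookups performed in assert_data_extraction_guard above.
    guards = [
        {
            'type': 'doc',                              # evaluate the path against the full document
            'path': '$.content_extraction.title.text',  # illustrative JSON path
            'regex': '.*for sale.*'                     # extraction is skipped unless this matches
        },
        {
            'type': 'path',               # evaluate the path against the matched input segment
            'path': '$.context.tag',      # illustrative path into the matched value
            'regex': '^title$'
        }
    ]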
31 changes: 24 additions & 7 deletions etk/data_extractors/table_extractor.py
@@ -64,7 +64,7 @@ def wrap_context(self, text):
def extract(self, table, dic):
# print dic
if table['features']['max_cols_in_a_row'] != 2 and table['features']['no_of_rows'] < 2:
return None
return []
res = []
for row in table['rows']:
if len(row['cells']) != 2:
@@ -202,7 +202,7 @@ def extract(self, html_doc, min_data_rows = 1):
for index_row, row in enumerate(rows):
row_dict = dict()
soup_row = BeautifulSoup(row, 'html.parser')
row_data = ''.join(soup_row.stripped_strings)
row_data = ' '.join(soup_row.stripped_strings)
row_data = row_data.replace("\\t", "").replace("\\r", "").replace("\\n", "")
if row_data != '':
row_len_list.append(len(row_data))
@@ -224,15 +224,15 @@ def extract(self, html_doc, min_data_rows = 1):
cell_dict = dict()
cell_dict["cell"] = str(td)
# cell_dict["text"] = [{"result": {"value": ''.join(td.stripped_strings)}}]
cell_dict["text"] = ''.join(td.stripped_strings)
cell_dict["text"] = ' '.join(td.stripped_strings)
# cell_dict["id"] = 'row_{0}_col_{1}'.format(index_row, index_col)
avg_cell_len += len(cell_dict["text"])
cell_list.append(cell_dict)
for index_col, td in enumerate(soup_row.findAll('td')):
cell_dict = dict()
cell_dict["cell"] = str(td)
# cell_dict["text"] = [{"result": {"value": ''.join(td.stripped_strings)}}]
cell_dict["text"] = ''.join(td.stripped_strings)
cell_dict["text"] = ' '.join(td.stripped_strings)
# cell_dict["id"] = 'row_{0}_col_{1}'.format(index_row, index_col)
avg_cell_len += len(cell_dict["text"])
cell_list.append(cell_dict)
@@ -272,7 +272,7 @@ def extract(self, html_doc, min_data_rows = 1):
h_index = 0
h_bool = True
for col in row.findAll('th'):
col_content = ''.join(col.stripped_strings)
col_content = ' '.join(col.stripped_strings)
h_bool = False
if col_content is None:
continue
@@ -283,7 +283,7 @@ def extract(self, html_doc, min_data_rows = 1):
if(h_index == 1 and h_bool == False):
d_index = 1
for col in row.findAll('td'):
col_content = ''.join(col.stripped_strings)
col_content = ' '.join(col.stripped_strings)
if col_content is None:
d_index += 1
continue
@@ -292,7 +292,7 @@ def extract(self, html_doc, min_data_rows = 1):
d_index += 1

for key, value in col_data.iteritems():
whole_col = ''.join(value)
whole_col = ' '.join(value)
# avg_cell_len += float("%.2f" % mean([len(x) for x in value]))
avg_col_len += sum([len(x) for x in value])
avg_col_len_dev += TableExtraction.pstdev([len(x) for x in value])
@@ -328,6 +328,23 @@ def create_fingerprint(table):
fingerprint = '-'.join(all_tokens)
return fingerprint

@staticmethod
def row_to_text(cells):
res = ''
for c in cells:
res += c['text'] + ' | '
return res

@staticmethod
def table_to_text(rows):
res = ''
for row in rows:
for c in row['cells']:
res += c['text'] + ' | '
res += '\n'
return res


@staticmethod
def gen_html(row_list):
""" Return html table string from a list of data rows """
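The new row_to_text and table_to_text helpers flatten the cell dictionaries built by extract() (each cell carries a 'text' entry) into pipe-separated strings. A small usage sketch with hand-made data — the import path is inferred from the file name, and the class name TableExtraction from the references in the diff above:

    from etk.data_extractors.table_extractor import TableExtraction

    # Rows shaped like the 'rows' structure produced by extract(): each row holds
    # a list of cell dicts, and each cell dict carries the stripped cell text.
    rows = [
        {'cells': [{'text': 'Caliber'}, {'text': '9 mm'}]},
        {'cells': [{'text': 'Capacity'}, {'text': '15 rounds'}]},
    ]

    print(TableExtraction.row_to_text(rows[0]['cells']))  # "Caliber | 9 mm | "
    print(TableExtraction.table_to_text(rows))            # one pipe-separated line per row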
146 changes: 143 additions & 3 deletions etk/resources/extraction_config_table_content.json
@@ -9,7 +9,21 @@
"landmark": [
],
"pickle": {
"my_dic": "/Users/majid/DIG/test_fields.txt"
"calibre_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/calibre_dict.txt",
"capacity_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/capacity_dict.txt",
"length_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/length_dict.txt",
"finish_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/finish_dict.txt",
"manufacturer_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/manufacturer_dict.txt",
"weight_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/weight_dict.txt",
"seller_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/seller_dict.txt",
"id_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/id_dict.txt",
"action_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/action_dict.txt",
"price_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/price_dict.txt",
"model_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/model_dict.txt",
"condition_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/condition_dict.txt",
"description_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/description_dict.txt",
"address_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/address_dict.txt",
"country_dict": "/Users/majid/DIG/DIG/test_table_atf/table_glossaries/country_dict.txt"
}
},
"content_extraction": {
@@ -29,11 +43,137 @@
"*.table.tables[*]"
],
"fields": {
"my_field": {
"calibre": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "my_dic"
"dic": "calibre_dict"
}
}
}
},
"capacity": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "capacity_dict"
}
}
}
},
"length": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "length_dict"
}
}
}
},
"finish": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "finish_dict"
}
}
}
},
"manufacturer": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "manufacturer_dict"
}
}
}
},
"weight": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "weight_dict"
}
}
}
},
"seller": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "seller_dict"
}
}
}
},
"id": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "id_dict"
}
}
}
},
"action": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "action_dict"
}
}
}
},
"price": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "price_dict"
}
}
}
},
"model": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "model_dict"
}
}
}
},
"condition": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "condition_dict"
}
}
}
},
"description": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "description_dict"
}
}
}
},
"address": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "address_dict"
}
}
}
},
"country": {
"extractors": {
"entity_table_extractor": {
"config": {
"dic": "country_dict"
}
}
}
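For context, this extraction config is the kind of object handed to the ETK core for processing; a hypothetical driver is sketched below. Only process() appears in this diff (etk/core.py above), so the Core constructor keyword and the CDR field names ('url', 'raw_content') are assumptions, and the glossary paths in the config must exist locally for it to run:

    import json

    from etk.core import Core  # module path taken from the etk/core.py diff above

    # Load the table-content extraction config shown above (path is illustrative).
    with open('etk/resources/extraction_config_table_content.json') as f:
        config = json.load(f)

    core = Core(extraction_config=config)  # assumption: constructor keyword name

    cdr_doc = {'url': 'http://example.org/listing', 'raw_content': '<html>...</html>'}
    result = core.process(cdr_doc, create_knowledge_graph=True)  # signature shown in this diff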
1 change: 1 addition & 0 deletions etk/unit_tests/ground_truth/nested_doc.jl

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions etk/unit_tests/ground_truth/nested_doc.out.jl

Large diffs are not rendered by default.
