Skip to content

Commit

Permalink
use tld from doc if present
Browse files Browse the repository at this point in the history
  • Loading branch information
saggu committed Jan 25, 2018
1 parent 627347d commit 3b462e1
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 1 deletion.
6 changes: 5 additions & 1 deletion etk/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@
# Module-level string constants used as lookup keys/identifiers.
# The "extract_*" values are extractor names; e.g. "extract_website_domain"
# is the key an extraction config uses to request that extractor.
_EXTRACT_ADDRESS = "extract_address"
_EXTRACT_AGE = "extract_age"
_CREATE_KG_NODE_EXTRACTOR = "create_kg_node_extractor"
_EXTRACT_WEBSITE_DOMAIN = "extract_website_domain"
_ADD_CONSTANT_KG = "add_constant_kg"
# NOTE(review): presumably config keys for guard clauses -- confirm against usage.
_GUARD = "guard"
_GUARDS = "guards"
Expand Down Expand Up @@ -558,6 +559,9 @@ def process(self, doc, create_knowledge_graph=True, html_description=False):
field,
results)
else:
if extractor == _EXTRACT_WEBSITE_DOMAIN:
if _TLD in doc:
extractors[extractor][_CONFIG][_TLD] = doc[_TLD]
if extractor == _EXTRACT_AS_IS:
segment = str(match.full_path)
else:
Expand Down Expand Up @@ -1479,7 +1483,7 @@ def _extract_using_dictionary(tokens, pre_process, trie, pre_filter, post_filter
def extract_website_domain(self, d, config):
    """Build the website-domain extraction for the text carried by ``d``.

    A TLD stored under the ``_TLD`` key of ``config`` is used as-is; only
    when that key is absent is the TLD computed from the text with
    ``self.extract_tld``.

    :param d: mapping holding the input text under the ``_TEXT`` key
    :param config: extractor config; must contain ``_FIELD_NAME`` and may
        contain ``_TLD``
    :return: the result of ``self._relevant_text_from_context`` called with
        the text, the ``{"value": tld}`` result and the field name
    """
    source_text = d[_TEXT]
    target_field = config[_FIELD_NAME]
    if _TLD in config:
        # A caller-provided TLD takes precedence over extraction from text.
        domain = config[_TLD]
    else:
        domain = self.extract_tld(source_text)
    return self._relevant_text_from_context(
        source_text, {"value": domain}, target_field)

Expand Down
57 changes: 57 additions & 0 deletions etk/unit_tests/test_content_extractions.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,63 @@ def test_extract_as_is_data(self):
self.assertTrue(r['knowledge_graph']['actor_description'][0]['data'][0]['description'] in
['Non-State, Internal, No State Sanction', 'Noncombatant Status Asserted'])

def test_tld_extraction(self):
    # The document carries no "tld" key; the asserted website value matches
    # the domain of the document's URL.
    website_field = {
        "extractors": {
            "extract_website_domain": {},
        },
    }
    extraction_config = {
        "document_id": "uri",
        "content_extraction": {},
        "data_extraction": [
            {
                "input_path": "content_extraction.url.text.`parent`",
                "fields": {"website": website_field},
            },
        ],
    }
    input_doc = {
        "url": "https://www.google.com/blah/this/part/doesnt/matter",
        "uri": "uri.1",
    }

    processed = Core(extraction_config=extraction_config).process(input_doc)

    websites = processed["knowledge_graph"]["website"]
    self.assertEqual(websites[0]["value"], "google.com")

def test_tld_extraction_from_doc(self):
    # The document carries its own "tld"; the asserted website value is that
    # tld rather than the domain of the URL.
    website_field = {
        "extractors": {
            "extract_website_domain": {},
        },
    }
    extraction_config = {
        "document_id": "uri",
        "content_extraction": {},
        "data_extraction": [
            {
                "input_path": "content_extraction.url.text.`parent`",
                "fields": {"website": website_field},
            },
        ],
    }
    input_doc = {
        "url": "https://www.google.com/blah/this/part/doesnt/matter",
        "uri": "uri.1",
        "tld": "xyz.org",
    }

    processed = Core(extraction_config=extraction_config).process(input_doc)

    websites = processed["knowledge_graph"]["website"]
    self.assertEqual(websites[0]["value"], "xyz.org")


# Run the unittest runner only when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 3b462e1

Please sign in to comment.