From 3b462e19619aa751821f38f87a918f10b4ef948e Mon Sep 17 00:00:00 2001
From: Amandeep Singh
Date: Thu, 25 Jan 2018 14:10:05 -0800
Subject: [PATCH] use tld from doc if present

---
Review notes (text between the "---" line and the first "diff --git" line is
not part of the commit message or of the diff itself):

  * Line breaks and indentation were rebuilt from a copy of this patch whose
    whitespace had been collapsed. The number of context/added/removed lines
    in every hunk matches its @@ header, but the absolute indentation of the
    context lines (above all in the process() hunk of etk/core.py) is a best
    guess -- TODO confirm against the target tree. If the hunks do not apply,
    try `git apply --ignore-whitespace` and then re-indent the added lines to
    match the surrounding block.
  * NOTE(review): the process() hunk stores doc[_TLD] inside the extractor's
    config dict. If that dict outlives a single process() call, a TLD taken
    from one document could still be used for a later document that has no
    "tld" key -- confirm against the rest of process().
  * The From: header carries no e-mail address in this copy; `git am` may
    need one to be restored.

 etk/core.py                                |  6 ++-
 etk/unit_tests/test_content_extractions.py | 57 ++++++++++++++++++++++
 2 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/etk/core.py b/etk/core.py
index ae6bf958..4e5147cb 100644
--- a/etk/core.py
+++ b/etk/core.py
@@ -128,6 +128,7 @@
 _EXTRACT_ADDRESS = "extract_address"
 _EXTRACT_AGE = "extract_age"
 _CREATE_KG_NODE_EXTRACTOR = "create_kg_node_extractor"
+_EXTRACT_WEBSITE_DOMAIN = "extract_website_domain"
 _ADD_CONSTANT_KG = "add_constant_kg"
 _GUARD = "guard"
 _GUARDS = "guards"
@@ -558,6 +559,9 @@ def process(self, doc, create_knowledge_graph=True, html_description=False):
                                                                 field,
                                                                 results)
                                                         else:
+                                                            if extractor == _EXTRACT_WEBSITE_DOMAIN:
+                                                                if _TLD in doc:
+                                                                    extractors[extractor][_CONFIG][_TLD] = doc[_TLD]
                                                             if extractor == _EXTRACT_AS_IS:
                                                                 segment = str(match.full_path)
                                                             else:
@@ -1479,7 +1483,7 @@ def _extract_using_dictionary(tokens, pre_process, trie, pre_filter, post_filter
     def extract_website_domain(self, d, config):
         text = d[_TEXT]
         field_name = config[_FIELD_NAME]
-        tld = self.extract_tld(text)
+        tld = config[_TLD] if _TLD in config else self.extract_tld(text)
         results = {"value": tld}
         return self._relevant_text_from_context(d[_TEXT], results, field_name)
 
diff --git a/etk/unit_tests/test_content_extractions.py b/etk/unit_tests/test_content_extractions.py
index df334c87..cca040c2 100644
--- a/etk/unit_tests/test_content_extractions.py
+++ b/etk/unit_tests/test_content_extractions.py
@@ -644,6 +644,63 @@ def test_extract_as_is_data(self):
         self.assertTrue(r['knowledge_graph']['actor_description'][0]['data'][0]['description'] in
                         ['Non-State, Internal, No State Sanction', 'Noncombatant Status Asserted'])
 
+    def test_tld_extraction(self):
+        doc = {
+            "url": "https://www.google.com/blah/this/part/doesnt/matter",
+            'uri': "uri.1"
+        }
+        e_config = {
+
+            "document_id": "uri",
+            "content_extraction": {},
+            "data_extraction": [
+                {
+                    "input_path": "content_extraction.url.text.`parent`",
+                    "fields": {
+                        "website": {
+                            "extractors": {
+                                "extract_website_domain": {
+                                }
+                            }
+
+                        }
+                    }
+                }
+            ]
+        }
+        c = Core(extraction_config=e_config)
+        r = c.process(doc)
+        self.assertEqual(r['knowledge_graph']['website'][0]['value'], 'google.com')
+
+    def test_tld_extraction_from_doc(self):
+        doc = {
+            "url": "https://www.google.com/blah/this/part/doesnt/matter",
+            'uri': "uri.1",
+            "tld": "xyz.org"
+        }
+        e_config = {
+
+            "document_id": "uri",
+            "content_extraction": {},
+            "data_extraction": [
+                {
+                    "input_path": "content_extraction.url.text.`parent`",
+                    "fields": {
+                        "website": {
+                            "extractors": {
+                                "extract_website_domain": {
+                                }
+                            }
+
+                        }
+                    }
+                }
+            ]
+        }
+        c = Core(extraction_config=e_config)
+        r = c.process(doc)
+        self.assertEqual(r['knowledge_graph']['website'][0]['value'], 'xyz.org')
+
 
 if __name__ == '__main__':
     unittest.main()