From 3b462e19619aa751821f38f87a918f10b4ef948e Mon Sep 17 00:00:00 2001
From: Amandeep Singh <amandeep.s.saggu@gmail.com>
Date: Thu, 25 Jan 2018 14:10:05 -0800
Subject: [PATCH] Use the doc's tld in extract_website_domain when present

---
 etk/core.py                                |  6 ++-
 etk/unit_tests/test_content_extractions.py | 57 ++++++++++++++++++++++
 2 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/etk/core.py b/etk/core.py
index ae6bf958..4e5147cb 100644
--- a/etk/core.py
+++ b/etk/core.py
@@ -128,6 +128,7 @@
 _EXTRACT_ADDRESS = "extract_address"
 _EXTRACT_AGE = "extract_age"
 _CREATE_KG_NODE_EXTRACTOR = "create_kg_node_extractor"
+_EXTRACT_WEBSITE_DOMAIN = "extract_website_domain"
 _ADD_CONSTANT_KG = "add_constant_kg"
 _GUARD = "guard"
 _GUARDS = "guards"
@@ -558,6 +559,9 @@ def process(self, doc, create_knowledge_graph=True, html_description=False):
                                                                                                                 field,
                                                                                                                 results)
                                                             else:
+                                                                if extractor == _EXTRACT_WEBSITE_DOMAIN:
+                                                                    if _TLD in doc:
+                                                                        extractors[extractor][_CONFIG][_TLD] = doc[_TLD]
                                                                 if extractor == _EXTRACT_AS_IS:
                                                                     segment = str(match.full_path)
                                                                 else:
@@ -1479,7 +1483,7 @@ def _extract_using_dictionary(tokens, pre_process, trie, pre_filter, post_filter
     def extract_website_domain(self, d, config):
         text = d[_TEXT]
         field_name = config[_FIELD_NAME]
-        tld = self.extract_tld(text)
+        tld = config.get(_TLD) or self.extract_tld(text)  # configured tld wins unless absent/empty
         results = {"value": tld}
         return self._relevant_text_from_context(d[_TEXT], results, field_name)
 
diff --git a/etk/unit_tests/test_content_extractions.py b/etk/unit_tests/test_content_extractions.py
index df334c87..cca040c2 100644
--- a/etk/unit_tests/test_content_extractions.py
+++ b/etk/unit_tests/test_content_extractions.py
@@ -644,6 +644,63 @@ def test_extract_as_is_data(self):
         self.assertTrue(r['knowledge_graph']['actor_description'][0]['data'][0]['description'] in
                         ['Non-State, Internal, No State Sanction', 'Noncombatant Status Asserted'])
 
+    def test_tld_extraction(self):
+        """The website field is 'google.com' when the doc has no "tld" key.
+
+        The doc only supplies a google.com url, and extract_website_domain
+        is configured without any options.
+        """
+        input_doc = {
+            "url": "https://www.google.com/blah/this/part/doesnt/matter",
+            'uri': "uri.1",
+        }
+        website_field = {"extractors": {"extract_website_domain": {}}}
+        extraction_rule = {
+            "input_path": "content_extraction.url.text.`parent`",
+            "fields": {"website": website_field},
+        }
+        extraction_config = {
+            "document_id": "uri",
+            "content_extraction": {},
+            "data_extraction": [extraction_rule],
+        }
+
+        core = Core(extraction_config=extraction_config)
+        processed = core.process(input_doc)
+
+        # Only the first extraction for the field is inspected.
+        first_website = processed['knowledge_graph']['website'][0]
+        self.assertEqual(first_website['value'], 'google.com')
+
+    def test_tld_extraction_from_doc(self):
+        """The website field takes the doc's own "tld" value when one is given.
+
+        The url still points at google.com, but the expected value is the
+        doc-supplied 'xyz.org'.
+        """
+        input_doc = {
+            "url": "https://www.google.com/blah/this/part/doesnt/matter",
+            'uri': "uri.1",
+            "tld": "xyz.org",
+        }
+        website_field = {"extractors": {"extract_website_domain": {}}}
+        extraction_rule = {
+            "input_path": "content_extraction.url.text.`parent`",
+            "fields": {"website": website_field},
+        }
+        extraction_config = {
+            "document_id": "uri",
+            "content_extraction": {},
+            "data_extraction": [extraction_rule],
+        }
+
+        core = Core(extraction_config=extraction_config)
+        processed = core.process(input_doc)
+
+        # Only the first extraction for the field is inspected.
+        first_website = processed['knowledge_graph']['website'][0]
+        self.assertEqual(first_website['value'], 'xyz.org')
+
 
 if __name__ == '__main__':
     unittest.main()