diff --git a/cc2dataset/main.py b/cc2dataset/main.py index 8e4283c..81656d2 100644 --- a/cc2dataset/main.py +++ b/cc2dataset/main.py @@ -106,6 +106,8 @@ def extract_domain(url): for extractor in FILTERED_EXTRACTORS.values(): for url in extract_test(extractor): domain = extract_domain(url) + if domain == "": + continue if domain in DOMAIN_IES_DICT: if extractor not in DOMAIN_IES_DICT[domain]: DOMAIN_IES_DICT[domain] = DOMAIN_IES_DICT[domain] + [extractor] @@ -113,7 +115,6 @@ def extract_domain(url): DOMAIN_IES_DICT[domain] = [extractor] - def is_link_suitable(link, extractors): """Check if link is valid given an extractor.""" try: