Skip to content

Commit

Permalink
remove empty domain
Browse files Browse the repository at this point in the history
  • Loading branch information
rom1504 committed Dec 8, 2023
1 parent 9af3383 commit 076358e
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion cc2dataset/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,14 +106,15 @@ def extract_domain(url):
# Index every filtered extractor under the domain of each of its test URLs,
# so DOMAIN_IES_DICT maps domain -> list of extractors seen for that domain.
for extractor in FILTERED_EXTRACTORS.values():
    for url in extract_test(extractor):
        domain = extract_domain(url)
        if domain == "":
            # extract_domain gave back an empty string: there is no usable
            # key for this URL, so it is left out of the index.
            continue
        if domain not in DOMAIN_IES_DICT:
            # First extractor seen for this domain.
            DOMAIN_IES_DICT[domain] = [extractor]
        elif extractor not in DOMAIN_IES_DICT[domain]:
            # Rebind to a new list (rather than appending in place) and
            # avoid listing the same extractor twice for one domain.
            DOMAIN_IES_DICT[domain] = DOMAIN_IES_DICT[domain] + [extractor]



def is_link_suitable(link, extractors):
"""Check if link is valid given an extractor."""
try:
Expand Down

0 comments on commit 076358e

Please sign in to comment.