Skip to content

Commit

Permalink
Sites checks fixes (#1896)
Browse files Browse the repository at this point in the history
Fixed incorrect site names, added method to compare sites
  • Loading branch information
soxoj authored Nov 26, 2024
1 parent f529d16 commit b370bc4
Show file tree
Hide file tree
Showing 5 changed files with 110 additions and 21 deletions.
35 changes: 28 additions & 7 deletions maigret/resources/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -31025,55 +31025,55 @@
"qa-part-form-profile"
]
},
".com": {
"{username}.com": {
"protocol": "dns",
"url": "{username}.com",
"urlMain": "{username}.com",
"usernameClaimed": "soxoj",
"usernameUnclaimed": "noonewouldeverusethis7",
"checkType": "status_code"
},
".pro": {
"{username}.pro": {
"protocol": "dns",
"url": "{username}.pro",
"urlMain": "{username}.pro",
"usernameClaimed": "alex",
"usernameUnclaimed": "noonewouldeverusethis7",
"checkType": "status_code"
},
".me": {
"{username}.me": {
"protocol": "dns",
"url": "{username}.me",
"urlMain": "{username}.me",
"usernameClaimed": "alex",
"usernameUnclaimed": "noonewouldeverusethis7",
"checkType": "status_code"
},
".biz": {
"{username}.biz": {
"protocol": "dns",
"url": "{username}.biz",
"urlMain": "{username}.biz",
"usernameClaimed": "alex",
"usernameUnclaimed": "noonewouldeverusethis7",
"checkType": "status_code"
},
".email": {
"{username}.email": {
"protocol": "dns",
"url": "{username}.email",
"urlMain": "{username}.email",
"usernameClaimed": "phone",
"usernameUnclaimed": "noonewouldeverusethis7",
"checkType": "status_code"
},
".guru": {
"{username}.guru": {
"protocol": "dns",
"url": "{username}.guru",
"urlMain": "{username}.guru",
"usernameClaimed": "alex",
"usernameUnclaimed": "noonewouldeverusethis7",
"checkType": "status_code"
},
".ddns.net": {
"{username}.ddns.net": {
"protocol": "dns",
"url": "{username}.ddns.net",
"urlMain": "{username}.ddns.net",
Expand Down Expand Up @@ -35201,6 +35201,27 @@
"urlMain": "https://massagerepublic.com",
"usernameClaimed": "lily88",
"usernameUnclaimed": "xzhsxfyfzi"
},
"mynickname.com": {
"checkType": "message",
"absenceStrs": [
"<h1>Error 404: Page not found</h1>",
"Nickname , certificate for username ",
"btn green",
"mailto:[email protected]",
">Register nickname</span></a></p>"
],
"presenseStrs": [
" title=",
"bold",
"title-line",
"codehtml",
"User offline"
],
"url": "https://mynickname.com/{username}",
"urlMain": "https://mynickname.com",
"usernameClaimed": "godbrithil",
"usernameUnclaimed": "fqiakbtdhu"
}
},
"engines": {
Expand Down
44 changes: 44 additions & 0 deletions maigret/sites.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,36 @@ def __init__(self, name, information):
def __str__(self):
    """Return a human-readable label: the site name followed by its main URL in parentheses."""
    return f"{self.name} ({self.url_main})"

def __is_equal_by_url_or_name(self, url_or_name_str: str):
    """Tell whether a string names this site or resembles one of its URLs.

    The comparison is case-insensitive. The string matches when any of the
    following holds:

    * it is exactly the site name;
    * it contains the site's main URL, or the main URL contains it;
    * the site's profile URL contains it.

    An empty string matches only a site whose name is itself empty.

    :param url_or_name_str: site name, URL or URL fragment to test.
    :return: ``True`` on a match, ``False`` otherwise (always a real bool).
    """
    needle = url_or_name_str.lower()

    # Names are compared exactly; no partial matching here.
    if self.name.lower() == needle:
        return True

    # An empty string is a substring of every string, so without this guard
    # "" would match every site that has a URL configured.
    if not needle:
        return False

    main_url = self.url_main.lower()
    # The main URL must be non-empty: "" is contained in any needle.
    # Exact equality is already covered by the substring tests.
    if main_url and (main_url in needle or needle in main_url):
        return True

    # A non-empty needle is never found in an empty URL, so no extra
    # emptiness check is needed for the profile URL.
    return needle in self.url.lower()

def __eq__(self, other):
    """Compare this site with another site or with a name/URL string.

    * Another ``MaigretSite``: equal when every attribute listed below is
      equal on both objects.
    * A ``str``: equal when the string is the site name (exactly) or is
      similar to the site's URLs, as decided by
      ``__is_equal_by_url_or_name``.
    * Anything else: ``NotImplemented`` is returned so Python can try the
      reflected comparison; ``==`` still evaluates to ``False`` for
      unrelated types.

    NOTE(review): a class that defines ``__eq__`` without ``__hash__``
    becomes unhashable; whether ``__hash__`` is defined elsewhere in the
    class is not visible from here.
    """
    if isinstance(other, MaigretSite):
        # Compare only relevant attributes, not internal state like request_future
        attrs_to_compare = (
            'name', 'url_main', 'url_subpath', 'type', 'headers',
            'errors', 'activation', 'regex_check', 'url_probe',
            'check_type', 'request_head_only', 'get_params',
            'presense_strs', 'absence_strs', 'stats', 'engine',
            'engine_data', 'alexa_rank', 'source', 'protocol',
        )
        return all(
            getattr(self, attr) == getattr(other, attr)
            for attr in attrs_to_compare
        )

    if isinstance(other, str):
        # Compare only by name (exactly) or url_main (partial similarity).
        # Coerce so that ``==`` always yields a real bool, whatever the
        # helper's boolean expression evaluates to.
        return bool(self.__is_equal_by_url_or_name(other))

    return NotImplemented


def update_detectors(self):
if "url" in self.__dict__:
url = self.url
Expand All @@ -101,6 +131,10 @@ def detect_username(self, url: str) -> Optional[str]:
return None

def extract_id_from_url(self, url: str) -> Optional[Tuple[str, str]]:
"""
Extracts username from url.
It's outdated, detects only a format of https://example.com/{username}
"""
if not self.url_regexp:
return None

Expand Down Expand Up @@ -223,6 +257,16 @@ def sites(self):
def sites_dict(self):
    """Build a fresh mapping from site name to site object.

    Sites are taken from ``self._sites`` in order; if two sites share a
    name, the later one replaces the earlier one.
    """
    by_name = {}
    for entry in self._sites:
        by_name[entry.name] = entry
    return by_name

def has_site(self, site: "MaigretSite | str"):
    """Tell whether the database already contains the given site.

    Every entry of ``self._sites`` is compared with ``site`` using ``==``,
    so the result depends on the equality rules of the stored objects.

    :param site: a site object, or a site name / URL string.
    :return: ``True`` if any stored site compares equal, else ``False``.
    """
    # No debug output here: this is library code and must not write to stdout.
    return any(site == known for known in self._sites)

def __contains__(self, site):
    """Support ``site in db`` by delegating to ``has_site``."""
    return self.has_site(site)

def ranked_sites_dict(
self,
reverse=False,
Expand Down
5 changes: 5 additions & 0 deletions maigret/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,11 @@ async def site_self_check(self, site, semaphore, silent=False):

self.logger.info(f"Site {site.name} checking is finished")

# remove service tag "unchecked"
if "unchecked" in site.tags:
site.tags.remove("unchecked")
changes["tags"] = site.tags

return changes

def generate_additional_fields_dialog(self, engine: MaigretEngine, dialog):
Expand Down
30 changes: 16 additions & 14 deletions sites.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

## List of supported sites (search methods): total 3125
## List of supported sites (search methods): total 3126

Rank data fetched from Alexa by domains.

Expand Down Expand Up @@ -2864,13 +2864,13 @@ Rank data fetched from Alexa by domains.
1. ![](https://www.google.com/s2/favicons?domain=https://ovnl.in) [ovnl.in (https://ovnl.in)](https://ovnl.in)*: top 100M, forum*, search is disabled
1. ![](https://www.google.com/s2/favicons?domain=https://wls.social) [wls.social (https://wls.social)](https://wls.social)*: top 100M, blog*, search is disabled
1. ![](https://www.google.com/s2/favicons?domain=http://answerszuvs3gg2l64e6hmnryudl5zgrmwm3vh65hzszdghblddvfiqd.onion) [HiddenAnswers (http://answerszuvs3gg2l64e6hmnryudl5zgrmwm3vh65hzszdghblddvfiqd.onion)](http://answerszuvs3gg2l64e6hmnryudl5zgrmwm3vh65hzszdghblddvfiqd.onion)*: top 100M, q&a, tor*
1. ![](https://www.google.com/s2/favicons?domain={username}.com) [.com ({username}.com)]({username}.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.pro) [.pro ({username}.pro)]({username}.pro)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.me) [.me ({username}.me)]({username}.me)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.biz) [.biz ({username}.biz)]({username}.biz)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.email) [.email ({username}.email)]({username}.email)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.guru) [.guru ({username}.guru)]({username}.guru)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.ddns.net) [.ddns.net ({username}.ddns.net)]({username}.ddns.net)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.com) [{username}.com ({username}.com)]({username}.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.pro) [{username}.pro ({username}.pro)]({username}.pro)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.me) [{username}.me ({username}.me)]({username}.me)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.biz) [{username}.biz ({username}.biz)]({username}.biz)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.email) [{username}.email ({username}.email)]({username}.email)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.guru) [{username}.guru ({username}.guru)]({username}.guru)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.ddns.net) [{username}.ddns.net ({username}.ddns.net)]({username}.ddns.net)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=http://forum-history.ru) [forum-history.ru (http://forum-history.ru)](http://forum-history.ru)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://forum.alconar.ru) [forum.alconar.ru (https://forum.alconar.ru)](https://forum.alconar.ru)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://krskforum.com) [krskforum.com (https://krskforum.com)](https://krskforum.com)*: top 100M*
Expand Down Expand Up @@ -3117,6 +3117,7 @@ Rank data fetched from Alexa by domains.
1. ![](https://www.google.com/s2/favicons?domain=https://www.stopstalk.com) [www.stopstalk.com (https://www.stopstalk.com)](https://www.stopstalk.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://www.polywork.com) [www.polywork.com (https://www.polywork.com)](https://www.polywork.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://oshwlab.com) [oshwlab.com (https://oshwlab.com)](https://oshwlab.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://www.xshaker.net) [www.xshaker.net (https://www.xshaker.net)](https://www.xshaker.net)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://chaturbator.su) [chaturbator.su (https://chaturbator.su)](https://chaturbator.su)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://imgflip.com) [imgflip.com (https://imgflip.com)](https://imgflip.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://www.flickr.com) [www.flickr.com (https://www.flickr.com)](https://www.flickr.com)*: top 100M*
Expand All @@ -3127,21 +3128,22 @@ Rank data fetched from Alexa by domains.
1. ![](https://www.google.com/s2/favicons?domain=https://archive.transformativeworks.org) [archive.transformativeworks.org (https://archive.transformativeworks.org)](https://archive.transformativeworks.org)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://www.tnaflix.com) [www.tnaflix.com (https://www.tnaflix.com)](https://www.tnaflix.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://massagerepublic.com) [massagerepublic.com (https://massagerepublic.com)](https://massagerepublic.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://mynickname.com) [mynickname.com (https://mynickname.com)](https://mynickname.com)*: top 100M, unchecked*

The list was updated at (2024-11-25 17:22:43.959448+00:00 UTC)
The list was updated at (2024-11-26 10:27:01.383232+00:00 UTC)
## Statistics

Enabled/total sites: 2693/3125 = 86.18%
Enabled/total sites: 2694/3126 = 86.18%

Incomplete message checks: 405/2693 = 15.04% (false positive risks)
Incomplete message checks: 405/2694 = 15.03% (false positive risks)

Status code checks: 720/2693 = 26.74% (false positive risks)
Status code checks: 720/2694 = 26.73% (false positive risks)

False positive risk (total): 41.78%
False positive risk (total): 41.76%

Top 20 profile URLs:
- (796) `{urlMain}/index/8-0-{username} (uCoz)`
- (301) `/{username}`
- (302) `/{username}`
- (221) `{urlMain}{urlSubpath}/members/?username={username} (XenForo)`
- (160) `/user/{username}`
- (133) `{urlMain}{urlSubpath}/member.php?username={username} (vBulletin)`
Expand Down
17 changes: 17 additions & 0 deletions tests/test_sites.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,3 +202,20 @@ def test_get_url_template():
},
)
assert site.get_url_template() == "SUBDOMAIN"


def test_has_site_url_or_name(default_db):
    """Check that ``has_site`` accepts full URLs, partial URLs and site names."""
    # by the same url or partial match
    assert default_db.has_site("https://aback.com.ua/user/") is True
    assert default_db.has_site("https://aback.com.ua") is True

    # acceptable partial match
    assert default_db.has_site("https://aback.com.ua/use") is True
    assert default_db.has_site("https://aback.com") is True

    # by name
    assert default_db.has_site("Aback") is True

    # false
    assert default_db.has_site("https://aeifgoai3h4g8a3u4g5") is False
    assert default_db.has_site("aeifgoai3h4g8a3u4g5") is False

0 comments on commit b370bc4

Please sign in to comment.