From f04de78682046e1909442c462c9279279178bb17 Mon Sep 17 00:00:00 2001 From: Soxoj <31013580+soxoj@users.noreply.github.com> Date: Fri, 6 Dec 2024 01:35:19 +0100 Subject: [PATCH] Activation mechanism documentation added (#1935) Few site checks fixed --- docs/source/development.rst | 59 ++++++++++++++++++++++++++++++++++++ docs/source/features.rst | 24 ++++++++++++--- maigret/resources/data.json | 57 +++++++++++++++++++---------------- sites.md | 60 ++++++++++++++++++------------------- tests/test_activation.py | 11 ++++--- 5 files changed, 145 insertions(+), 66 deletions(-) diff --git a/docs/source/development.rst b/docs/source/development.rst index 11100e61..64d75f6a 100644 --- a/docs/source/development.rst +++ b/docs/source/development.rst @@ -110,6 +110,65 @@ There are few options for sites data.json helpful in various cases: - ``requestHeadOnly`` - set to ``true`` if it's enough to make a HEAD request to the site - ``regexCheck`` - a regex to check if the username is valid, in case of frequent false-positives +.. _activation-mechanism: + +Activation mechanism +-------------------- + +The activation mechanism helps make requests to sites requiring additional authentication like cookies, JWT tokens, or custom headers. + +Let's study the Vimeo site check record from the Maigret database: + +.. code-block:: json + + "Vimeo": { + "tags": [ + "us", + "video" + ], + "headers": { + "Authorization": "jwt eyJ0..." + }, + "activation": { + "url": "https://vimeo.com/_rv/viewer", + "marks": [ + "Something strange occurred. Please get in touch with the app's creator." + ], + "method": "vimeo" + }, + "urlProbe": "https://api.vimeo.com/users/{username}?fields=name...", + "checkType": "status_code", + "alexaRank": 148, + "urlMain": "https://vimeo.com/", + "url": "https://vimeo.com/{username}", + "usernameClaimed": "blue", + "usernameUnclaimed": "noonewouldeverusethis7" + }, + +The activation method is: + +.. code-block:: python + + def vimeo(site, logger, cookies={}): + headers = dict(site.headers) + if "Authorization" in headers: + del headers["Authorization"] + import requests + + r = requests.get(site.activation["url"], headers=headers) + jwt_token = r.json()["jwt"] + site.headers["Authorization"] = "jwt " + jwt_token + +Here's how the activation process works when a JWT token becomes invalid: + +1. The site check makes an HTTP request to ``urlProbe`` with the invalid token +2. The response contains an error message specified in the ``activation``/``marks`` field +3. When this error is detected, the ``vimeo`` activation function is triggered +4. The activation function obtains a new JWT token and updates it in the site check record +5. On the next site check (either through retry or a new Maigret run), the valid token is used and the check succeeds + +Examples of activation mechanism implementation are available in `activation.py `_ file. + How to publish new version of Maigret ------------------------------------- diff --git a/docs/source/features.rst b/docs/source/features.rst index 84f1d598..28dc1cfe 100644 --- a/docs/source/features.rst +++ b/docs/source/features.rst @@ -147,16 +147,32 @@ Archives and mirrors checking The Maigret database contains not only the original websites, but also mirrors, archives, and aggregators. For example: -- `Reddit BigData search `_ - `Picuki `_, Instagram mirror -- `Twitter shadowban `_ checker +- (no longer available) `Reddit BigData search `_ +- (no longer available) `Twitter shadowban `_ checker It allows getting additional info about the person and checking the existence of the account even if the main site is unavailable (bot protection, captcha, etc.) +Activation +---------- +The activation mechanism helps make requests to sites requiring additional authentication like cookies, JWT tokens, or custom headers. + +It works by implementing a custom function that: + +1. Makes a specialized HTTP request to a specific website endpoint +2. Processes the response +3. Updates the headers/cookies for that site in the local Maigret database + +Since activation only triggers after encountering specific errors, a retry (or another Maigret run) is needed to obtain a valid response with the updated authentication. + +The activation mechanism is enabled by default, and cannot be disabled at the moment. + +See for more details in Development section :ref:`activation-mechanism`. + .. _extracting-information-from-pages: -Extractiion of information from account pages ---------------------------------------------- +Extraction of information from account pages +-------------------------------------------- Maigret can parse URLs and content of web pages by URLs to extract info about account owner and other meta information. diff --git a/maigret/resources/data.json b/maigret/resources/data.json index 5b6505c9..c09635d7 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -5260,19 +5260,18 @@ "regexCheck": "^[a-zA-Z0-9_\\.]{3,49}(?