From 5518ae2d942ca0288f1d0e635f730d49a0932258 Mon Sep 17 00:00:00 2001 From: James Addison Date: Fri, 19 Apr 2024 17:17:38 +0100 Subject: [PATCH 01/15] MarleySpoon: add precautionary check for unexpected API URLs. --- recipe_scrapers/marleyspoon.py | 15 ++++++++++++++- tests/legacy/test_data/faulty.testhtml | 6 ++++++ tests/legacy/test_marleyspoon_invalid.py | 24 ++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 tests/legacy/test_data/faulty.testhtml create mode 100644 tests/legacy/test_marleyspoon_invalid.py diff --git a/recipe_scrapers/marleyspoon.py b/recipe_scrapers/marleyspoon.py index 1325a201b..989fe5690 100644 --- a/recipe_scrapers/marleyspoon.py +++ b/recipe_scrapers/marleyspoon.py @@ -1,11 +1,12 @@ # mypy: disallow_untyped_defs=False +from urllib.parse import urljoin, urlsplit import json import re import requests from ._abstract import HEADERS, AbstractScraper -from ._exceptions import ElementNotFoundInHtml +from ._exceptions import ElementNotFoundInHtml, RecipeScrapersExceptions from ._utils import normalize_string ID_PATTERN = re.compile(r"/(\d+)-") @@ -65,6 +66,18 @@ def _get_json_params(self): if api_url is None or api_token is None: raise ElementNotFoundInHtml("Required script not found.") + scraper_name = self.__class__.__name__ + expected_domain = scraper_name.lower() + try: + api_url = urljoin(self.url, api_url) + url_info = urlsplit(api_url) + domain_prefix, _ = url_info.hostname.rsplit(".", 1) + if not f".{domain_prefix}".endswith(f".{expected_domain}"): + msg = f"Domain for {api_url} does not contain expected part: {expected_domain}" + raise ValueError(msg) + except Exception: + raise RecipeScrapersExceptions(f"Unexpected API URL: {api_url}") + return api_url, api_token @classmethod diff --git a/tests/legacy/test_data/faulty.testhtml b/tests/legacy/test_data/faulty.testhtml new file mode 100644 index 000000000..b0c4f2998 --- /dev/null +++ b/tests/legacy/test_data/faulty.testhtml @@ -0,0 +1,6 @@ + + + 
+ diff --git a/tests/legacy/test_marleyspoon_invalid.py b/tests/legacy/test_marleyspoon_invalid.py new file mode 100644 index 000000000..b6b5426f3 --- /dev/null +++ b/tests/legacy/test_marleyspoon_invalid.py @@ -0,0 +1,24 @@ +import unittest + +import responses + +from recipe_scrapers._exceptions import RecipeScrapersExceptions +from recipe_scrapers.marleyspoon import MarleySpoon + + +class TestFaultyAPIURLResponse(unittest.TestCase): + + @responses.activate + def test_invalid_scraper(self): + valid_url = "https://marleyspoon.de/menu/113813-glasierte-veggie-burger-mit-roestkartoffeln-und-apfel-gurken-salat" + with open("tests/legacy/test_data/faulty.testhtml", "r") as faulty_data: + faulty_response = faulty_data.read() + + responses.add( + method=responses.GET, + url=valid_url, + body=faulty_response, + ) + + with self.assertRaises(RecipeScrapersExceptions): + scraper = MarleySpoon(url=valid_url) From 06e5bf8a27c24e9b4cfedbfc496310b9ee2a6510 Mon Sep 17 00:00:00 2001 From: James Addison Date: Fri, 19 Apr 2024 17:25:04 +0100 Subject: [PATCH 02/15] Fixup: linting: remove unused variable. --- tests/legacy/test_marleyspoon_invalid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/legacy/test_marleyspoon_invalid.py b/tests/legacy/test_marleyspoon_invalid.py index b6b5426f3..98e4a0f3c 100644 --- a/tests/legacy/test_marleyspoon_invalid.py +++ b/tests/legacy/test_marleyspoon_invalid.py @@ -21,4 +21,4 @@ def test_invalid_scraper(self): ) with self.assertRaises(RecipeScrapersExceptions): - scraper = MarleySpoon(url=valid_url) + MarleySpoon(url=valid_url) From cf9c0594f5af2ae84c1ecac2ee29244dbebbd3e8 Mon Sep 17 00:00:00 2001 From: James Addison Date: Fri, 19 Apr 2024 17:27:23 +0100 Subject: [PATCH 03/15] Fixup: linting: use isort to re-order imports. 
--- recipe_scrapers/marleyspoon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipe_scrapers/marleyspoon.py b/recipe_scrapers/marleyspoon.py index 989fe5690..3676276b9 100644 --- a/recipe_scrapers/marleyspoon.py +++ b/recipe_scrapers/marleyspoon.py @@ -1,7 +1,7 @@ # mypy: disallow_untyped_defs=False -from urllib.parse import urljoin, urlsplit import json import re +from urllib.parse import urljoin, urlsplit import requests From c25b0e30c8155facdc2fe165adb5c30984580d6d Mon Sep 17 00:00:00 2001 From: James Addison Date: Fri, 19 Apr 2024 17:30:59 +0100 Subject: [PATCH 04/15] Fixup: linting: apply pyupgrade (py3.8+) to test module. --- tests/legacy/test_marleyspoon_invalid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/legacy/test_marleyspoon_invalid.py b/tests/legacy/test_marleyspoon_invalid.py index 98e4a0f3c..41527568c 100644 --- a/tests/legacy/test_marleyspoon_invalid.py +++ b/tests/legacy/test_marleyspoon_invalid.py @@ -11,7 +11,7 @@ class TestFaultyAPIURLResponse(unittest.TestCase): @responses.activate def test_invalid_scraper(self): valid_url = "https://marleyspoon.de/menu/113813-glasierte-veggie-burger-mit-roestkartoffeln-und-apfel-gurken-salat" - with open("tests/legacy/test_data/faulty.testhtml", "r") as faulty_data: + with open("tests/legacy/test_data/faulty.testhtml") as faulty_data: faulty_response = faulty_data.read() responses.add( From 5f2a6bdafbf3e9e55100d221a47e2f027210065a Mon Sep 17 00:00:00 2001 From: James Addison Date: Fri, 19 Apr 2024 17:38:50 +0100 Subject: [PATCH 05/15] MarleySpoon: remove use of variable shadowing that introduced a change-in-behaviour. 
--- recipe_scrapers/marleyspoon.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipe_scrapers/marleyspoon.py b/recipe_scrapers/marleyspoon.py index 3676276b9..9946d6546 100644 --- a/recipe_scrapers/marleyspoon.py +++ b/recipe_scrapers/marleyspoon.py @@ -69,8 +69,8 @@ def _get_json_params(self): scraper_name = self.__class__.__name__ expected_domain = scraper_name.lower() try: - api_url = urljoin(self.url, api_url) - url_info = urlsplit(api_url) + validation_url = urljoin(self.url, api_url) + url_info = urlsplit(validation_url) domain_prefix, _ = url_info.hostname.rsplit(".", 1) if not f".{domain_prefix}".endswith(f".{expected_domain}"): msg = f"Domain for {api_url} does not contain expected part: {expected_domain}" From 39cc78823c3fad1b745d40999c7e7c746fbd9db6 Mon Sep 17 00:00:00 2001 From: James Addison Date: Fri, 19 Apr 2024 17:41:55 +0100 Subject: [PATCH 06/15] MarleySpoon: tests: rename test case. --- tests/legacy/test_marleyspoon_invalid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/legacy/test_marleyspoon_invalid.py b/tests/legacy/test_marleyspoon_invalid.py index 41527568c..108f087ea 100644 --- a/tests/legacy/test_marleyspoon_invalid.py +++ b/tests/legacy/test_marleyspoon_invalid.py @@ -9,7 +9,7 @@ class TestFaultyAPIURLResponse(unittest.TestCase): @responses.activate - def test_invalid_scraper(self): + def test_faulty_response(self): valid_url = "https://marleyspoon.de/menu/113813-glasierte-veggie-burger-mit-roestkartoffeln-und-apfel-gurken-salat" with open("tests/legacy/test_data/faulty.testhtml") as faulty_data: faulty_response = faulty_data.read() From ca2154fb53d508bf28e0d036ff91ceff6c45981e Mon Sep 17 00:00:00 2001 From: James Addison Date: Fri, 19 Apr 2024 17:46:22 +0100 Subject: [PATCH 07/15] MarleySpoon: tests: add coverage for relative-URL API host case. 
--- tests/legacy/test_data/relative_url.testhtml | 6 ++++++ tests/legacy/test_marleyspoon_invalid.py | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 tests/legacy/test_data/relative_url.testhtml diff --git a/tests/legacy/test_data/relative_url.testhtml b/tests/legacy/test_data/relative_url.testhtml new file mode 100644 index 000000000..16650db42 --- /dev/null +++ b/tests/legacy/test_data/relative_url.testhtml @@ -0,0 +1,6 @@ + + + + diff --git a/tests/legacy/test_marleyspoon_invalid.py b/tests/legacy/test_marleyspoon_invalid.py index 108f087ea..3f89239bc 100644 --- a/tests/legacy/test_marleyspoon_invalid.py +++ b/tests/legacy/test_marleyspoon_invalid.py @@ -22,3 +22,20 @@ def test_faulty_response(self): with self.assertRaises(RecipeScrapersExceptions): MarleySpoon(url=valid_url) + + @responses.activate + def test_relative_api_url(self): + valid_url = "https://marleyspoon.de/menu/113813-glasierte-veggie-burger-mit-roestkartoffeln-und-apfel-gurken-salat" + with open("tests/legacy/test_data/relative_url.testhtml") as relative_url_data: + relative_url_response = relative_url_data.read() + + responses.add( + method=responses.GET, + url=valid_url, + body=relative_url_response, + ) + + with self.assertRaises(Exception): + MarleySpoon( + url=valid_url + ) # currently this raises an requests.exceptions.MissingSchema exception From c24cc7b2b0f8d11fe79c66149e97577b8cbcc6ae Mon Sep 17 00:00:00 2001 From: James Addison Date: Fri, 19 Apr 2024 17:47:02 +0100 Subject: [PATCH 08/15] MarleySpoon: tests: brevity: rename 'valid_url' to 'url'. 
--- tests/legacy/test_marleyspoon_invalid.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/legacy/test_marleyspoon_invalid.py b/tests/legacy/test_marleyspoon_invalid.py index 3f89239bc..a8bde3b42 100644 --- a/tests/legacy/test_marleyspoon_invalid.py +++ b/tests/legacy/test_marleyspoon_invalid.py @@ -10,32 +10,32 @@ class TestFaultyAPIURLResponse(unittest.TestCase): @responses.activate def test_faulty_response(self): - valid_url = "https://marleyspoon.de/menu/113813-glasierte-veggie-burger-mit-roestkartoffeln-und-apfel-gurken-salat" + url = "https://marleyspoon.de/menu/113813-glasierte-veggie-burger-mit-roestkartoffeln-und-apfel-gurken-salat" with open("tests/legacy/test_data/faulty.testhtml") as faulty_data: faulty_response = faulty_data.read() responses.add( method=responses.GET, - url=valid_url, + url=url, body=faulty_response, ) with self.assertRaises(RecipeScrapersExceptions): - MarleySpoon(url=valid_url) + MarleySpoon(url=url) @responses.activate def test_relative_api_url(self): - valid_url = "https://marleyspoon.de/menu/113813-glasierte-veggie-burger-mit-roestkartoffeln-und-apfel-gurken-salat" + url = "https://marleyspoon.de/menu/113813-glasierte-veggie-burger-mit-roestkartoffeln-und-apfel-gurken-salat" with open("tests/legacy/test_data/relative_url.testhtml") as relative_url_data: relative_url_response = relative_url_data.read() responses.add( method=responses.GET, - url=valid_url, + url=url, body=relative_url_response, ) with self.assertRaises(Exception): MarleySpoon( - url=valid_url + url=url ) # currently this raises an requests.exceptions.MissingSchema exception From eb286cbf23316c1e130cea062330e2612f947344 Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 30 Apr 2024 13:15:38 +0100 Subject: [PATCH 09/15] MarleySpoon: adjustment: use is-same-scraper condition to decide whether a request is valid or not. 
--- recipe_scrapers/marleyspoon.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/recipe_scrapers/marleyspoon.py b/recipe_scrapers/marleyspoon.py index 9946d6546..e6afa1d83 100644 --- a/recipe_scrapers/marleyspoon.py +++ b/recipe_scrapers/marleyspoon.py @@ -7,7 +7,7 @@ from ._abstract import HEADERS, AbstractScraper from ._exceptions import ElementNotFoundInHtml, RecipeScrapersExceptions -from ._utils import normalize_string +from ._utils import get_host_name, normalize_string ID_PATTERN = re.compile(r"/(\d+)-") SCRIPT_PATTERN = re.compile( @@ -67,14 +67,12 @@ def _get_json_params(self): raise ElementNotFoundInHtml("Required script not found.") scraper_name = self.__class__.__name__ - expected_domain = scraper_name.lower() try: - validation_url = urljoin(self.url, api_url) - url_info = urlsplit(validation_url) - domain_prefix, _ = url_info.hostname.rsplit(".", 1) - if not f".{domain_prefix}".endswith(f".{expected_domain}"): - msg = f"Domain for {api_url} does not contain expected part: {expected_domain}" - raise ValueError(msg) + next_url = urljoin(self.url, api_url) + host_name = get_host_name(next_url) + next_scraper = SCRAPERS[host_name] + if not isinstance(self, next_scraper): + raise ValueError(f"Attempted to scrape using {next_scraper} from {scraper_name}") except Exception: raise RecipeScrapersExceptions(f"Unexpected API URL: {api_url}") From b06eec99ced52b5f8cdbfa913c4e24215dccb7f9 Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 30 Apr 2024 13:18:33 +0100 Subject: [PATCH 10/15] MarleySpoon: exception handling: include link from raised-exception to originating-exception. 
--- recipe_scrapers/marleyspoon.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipe_scrapers/marleyspoon.py b/recipe_scrapers/marleyspoon.py index e6afa1d83..d47d44802 100644 --- a/recipe_scrapers/marleyspoon.py +++ b/recipe_scrapers/marleyspoon.py @@ -73,8 +73,8 @@ def _get_json_params(self): next_scraper = SCRAPERS[host_name] if not isinstance(self, next_scraper): raise ValueError(f"Attempted to scrape using {next_scraper} from {scraper_name}") - except Exception: - raise RecipeScrapersExceptions(f"Unexpected API URL: {api_url}") + except Exception as e: + raise RecipeScrapersExceptions(f"Unexpected API URL: {api_url}") from e return api_url, api_token From 9c94ee9e44c2a434a8d52e81a7b1d83ee7c8d9b9 Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 30 Apr 2024 13:22:57 +0100 Subject: [PATCH 11/15] MarleySpoon: fixup: add missing SCRAPERS import (localised; not ideal, but avoids a circular import). --- recipe_scrapers/marleyspoon.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/recipe_scrapers/marleyspoon.py b/recipe_scrapers/marleyspoon.py index d47d44802..1d9b3898f 100644 --- a/recipe_scrapers/marleyspoon.py +++ b/recipe_scrapers/marleyspoon.py @@ -66,6 +66,8 @@ def _get_json_params(self): if api_url is None or api_token is None: raise ElementNotFoundInHtml("Required script not found.") + from . import SCRAPERS + scraper_name = self.__class__.__name__ try: next_url = urljoin(self.url, api_url) From 7561de5ac6405987be2b88784524c1e4bcbc206e Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 30 Apr 2024 13:29:03 +0100 Subject: [PATCH 12/15] MarleySpoon: reduce constraint: allow less-precise matches on partial host domain name. 
--- recipe_scrapers/marleyspoon.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/recipe_scrapers/marleyspoon.py b/recipe_scrapers/marleyspoon.py index 1d9b3898f..35ce9873e 100644 --- a/recipe_scrapers/marleyspoon.py +++ b/recipe_scrapers/marleyspoon.py @@ -72,7 +72,13 @@ def _get_json_params(self): try: next_url = urljoin(self.url, api_url) host_name = get_host_name(next_url) - next_scraper = SCRAPERS[host_name] + next_scraper = type(None) + # check: api.foo.xx.example, foo.xx.example, xx.example + while host_name and host_name.count('.'): + next_scraper = SCRAPERS.get(host_name) + if next_scraper: + break + host_name = '.'.join(host_name.split('.')[1:]) if not isinstance(self, next_scraper): raise ValueError(f"Attempted to scrape using {next_scraper} from {scraper_name}") except Exception as e: From 2a5e003c5abb8111b314bf1f1ba45cdad30a81f1 Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 30 Apr 2024 13:32:18 +0100 Subject: [PATCH 13/15] MarleySpoon: linting: adjust code to comply with black code style recommendations / requirements. 
--- recipe_scrapers/marleyspoon.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/recipe_scrapers/marleyspoon.py b/recipe_scrapers/marleyspoon.py index 35ce9873e..8954cb20e 100644 --- a/recipe_scrapers/marleyspoon.py +++ b/recipe_scrapers/marleyspoon.py @@ -74,13 +74,14 @@ def _get_json_params(self): host_name = get_host_name(next_url) next_scraper = type(None) # check: api.foo.xx.example, foo.xx.example, xx.example - while host_name and host_name.count('.'): + while host_name and host_name.count("."): next_scraper = SCRAPERS.get(host_name) if next_scraper: break - host_name = '.'.join(host_name.split('.')[1:]) + host_name = ".".join(host_name.split(".")[1:]) if not isinstance(self, next_scraper): - raise ValueError(f"Attempted to scrape using {next_scraper} from {scraper_name}") + msg = f"Attempted to scrape using {next_scraper} from {scraper_name}" + raise ValueError(msg) except Exception as e: raise RecipeScrapersExceptions(f"Unexpected API URL: {api_url}") from e From ff02a0c0899fb2c69321f73a900d6f54fa093d28 Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 30 Apr 2024 13:34:13 +0100 Subject: [PATCH 14/15] MarleySpoon: refactor: adjust domain-climbing logic. 
--- recipe_scrapers/marleyspoon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipe_scrapers/marleyspoon.py b/recipe_scrapers/marleyspoon.py index 8954cb20e..8c05100de 100644 --- a/recipe_scrapers/marleyspoon.py +++ b/recipe_scrapers/marleyspoon.py @@ -78,7 +78,7 @@ def _get_json_params(self): next_scraper = SCRAPERS.get(host_name) if next_scraper: break - host_name = ".".join(host_name.split(".")[1:]) + _, host_name = host_name.split(".", 1) if not isinstance(self, next_scraper): msg = f"Attempted to scrape using {next_scraper} from {scraper_name}" raise ValueError(msg) From 1dfd79b23bb7b8baf424145f70ffe9fcc2c681d0 Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 30 Apr 2024 13:37:24 +0100 Subject: [PATCH 15/15] MarleySpoon: cleanup: remove unused import. --- recipe_scrapers/marleyspoon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipe_scrapers/marleyspoon.py b/recipe_scrapers/marleyspoon.py index 8c05100de..d0f44ca14 100644 --- a/recipe_scrapers/marleyspoon.py +++ b/recipe_scrapers/marleyspoon.py @@ -1,7 +1,7 @@ # mypy: disallow_untyped_defs=False import json import re -from urllib.parse import urljoin, urlsplit +from urllib.parse import urljoin import requests