diff --git a/extruct/jsonld.py b/extruct/jsonld.py index 75b04a87..75b2c774 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -8,6 +8,7 @@ import jstyleson import lxml.etree +import logging from extruct.utils import parse_html @@ -28,14 +29,25 @@ def extract_items(self, document, base_url=None): if items for item in items if item ] + def _may_be_get_json(self, script): + try: + return json.loads(script, strict=False) + except Exception: + return None + def _extract_items(self, node): script = node.xpath('string()') - try: - # TODO: `strict=False` can be configurable if needed - data = json.loads(script, strict=False) - except ValueError: + data = self._may_be_get_json(script) + # check if valid json. + if not data: # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments - data = jstyleson.loads(HTML_OR_JS_COMMENTLINE.sub('', script), strict=False) + script = jstyleson.dispose(HTML_OR_JS_COMMENTLINE.sub('', script)) + data = self._may_be_get_json(script) + # After processing check if json is still valid. + if not data: + logging.exception('Invalid jsonld element detected %s', script) + return + if isinstance(data, list): for item in data: yield item diff --git a/tests/samples/custom.invalid/JSONLD_valid_and_invalid.html b/tests/samples/custom.invalid/JSONLD_valid_and_invalid.html new file mode 100644 index 00000000..9efd3470 --- /dev/null +++ b/tests/samples/custom.invalid/JSONLD_valid_and_invalid.html @@ -0,0 +1,16 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/samples/custom.invalid/JSONLD_valid_and_invalid.jsonld b/tests/samples/custom.invalid/JSONLD_valid_and_invalid.jsonld new file mode 100644 index 00000000..e17b54c6 --- /dev/null +++ b/tests/samples/custom.invalid/JSONLD_valid_and_invalid.jsonld @@ -0,0 +1 @@ +[ {"foo" : "bar"} ] \ No newline at end of file diff --git a/tests/test_jsonld.py b/tests/test_jsonld.py index 6edc2877..4d48d416 100644 --- a/tests/test_jsonld.py +++ b/tests/test_jsonld.py @@ -18,6 +18,12 @@ def test_songkick(self): 'Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015' ) + def test_when_page_has_invalid_jsonld_elements_should_skip(self): + self.assertJsonLdCorrect( + folder='custom.invalid', + page='JSONLD_valid_and_invalid' + ) + def test_jsonld_empty_item(self): self.assertJsonLdCorrect( folder='songkick',