diff --git a/mf2py/dom_helpers.py b/mf2py/dom_helpers.py index 8b10f43..9b3837f 100644 --- a/mf2py/dom_helpers.py +++ b/mf2py/dom_helpers.py @@ -4,7 +4,7 @@ import bs4 import re -from bs4.element import Tag, NavigableString, Comment +from bs4.element import Tag, NavigableString, Comment, MinimalHTMLFormatter if sys.version < '3': from urlparse import urljoin @@ -21,6 +21,11 @@ _reduce_spaces_regex = re.compile(r" {2,}") +class MinimalHTML5Formatter(MinimalHTMLFormatter): + """An HTML formatter that omits the slash in void tags and otherwise does minimal replacement""" + void_element_close_prefix = None + + def try_urljoin(base, url, allow_fragments=True): """attempts urljoin, on ValueError passes through url. Shortcuts http(s):// urls""" if url.startswith(("https://", "http://")): diff --git a/mf2py/parse_property.py b/mf2py/parse_property.py index 2d72259..2e151d9 100644 --- a/mf2py/parse_property.py +++ b/mf2py/parse_property.py @@ -1,7 +1,7 @@ """functions to parse the properties of elements""" from __future__ import unicode_literals, print_function -from .dom_helpers import get_attr, get_img_src_alt, get_textContent, try_urljoin +from .dom_helpers import get_attr, get_img_src_alt, get_textContent, try_urljoin, MinimalHTML5Formatter from .datetime_helpers import normalize_datetime, DATETIME_RE, TIME_RE from . import value_class_pattern @@ -104,6 +104,6 @@ def datetime(el, default_date=None): def embedded(el, base_url=''): """Process e-* properties""" return { - 'html': el.decode_contents().strip(), # secret bs4 method to get innerHTML + 'html': el.decode_contents(formatter=MinimalHTML5Formatter()).strip(), # secret bs4 method to get innerHTML 'value': get_textContent(el, replace_img=True, base_url=base_url) } diff --git a/setup.py b/setup.py index ac6277a..bad50b3 100644 --- a/setup.py +++ b/setup.py @@ -20,8 +20,8 @@ install_requires=[ # Keep in sync with requirements.txt! 
'html5lib>=1.0.1', - 'requests>=2.18.4', - 'BeautifulSoup4>=4.6.0', + 'requests>=2.19.1', + 'BeautifulSoup4>=4.6.3', ], tests_require=[ 'lxml',