from __future__ import absolute_import, division, unicode_literals try: import json except ImportError: import simplejson as json from html5lib import html5parser, sanitizer, constants, treebuilders def toxmlFactory(): tree = treebuilders.getTreeBuilder("etree") def toxml(element): # encode/decode roundtrip required for Python 2.6 compatibility result_bytes = tree.implementation.tostring(element, encoding="utf-8") return result_bytes.decode("utf-8") return toxml def runSanitizerTest(name, expected, input, toxml=None): if toxml is None: toxml = toxmlFactory() expected = ''.join([toxml(token) for token in html5parser.HTMLParser(). parseFragment(expected)]) expected = json.loads(json.dumps(expected)) assert expected == sanitize_html(input) def sanitize_html(stream, toxml=None): if toxml is None: toxml = toxmlFactory() return ''.join([toxml(token) for token in html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer). parseFragment(stream)]) def test_should_handle_astral_plane_characters(): assert '<html:p xmlns:html="http://www.w3.org/1999/xhtml">\U0001d4b5 \U0001d538</html:p>' == sanitize_html("<p>𝒵 𝔸</p>") def test_should_allow_relative_uris(): assert '<html:p xmlns:html="http://www.w3.org/1999/xhtml"><html:a href="/example.com" /></html:p>' == sanitize_html('<p><a href="/example.com"></a></p>') def test_sanitizer(): toxml = toxmlFactory() for tag_name in sanitizer.HTMLSanitizer.allowed_elements: if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']: continue # TODO if tag_name != tag_name.lower(): continue # TODO if tag_name == 'image': yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, "<img title=\"1\"/>foo <bad>bar</bad> baz", "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), toxml) elif tag_name == 'br': yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, "<br title=\"1\"/>foo <bad>bar</bad> baz<br/>", "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), toxml) elif tag_name in constants.voidElements: yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, "<%s title=\"1\"/>foo <bad>bar</bad> baz" % tag_name, "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), toxml) else: yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, "<%s title=\"1\">foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), toxml) for tag_name in sanitizer.HTMLSanitizer.allowed_elements: tag_name = tag_name.upper() yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name, "<%s title=\"1\">foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), toxml) for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes: if attribute_name != attribute_name.lower(): continue # TODO if attribute_name == 'style': continue attribute_value = 'foo' if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri: attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0] yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name, "<p %s=\"%s\">foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value), "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value), toxml) for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes: attribute_name = attribute_name.upper() yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name, "<p>foo <bad>bar</bad> baz</p>", "<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name, toxml) for protocol in sanitizer.HTMLSanitizer.allowed_protocols: rest_of_uri = '//sub.domain.tld/path/object.ext' if protocol == 'data': rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ=' yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol, "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri), """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri), toxml) yield (runSanitizerTest, "test_invalid_data_uri", "<audio controls=\"\"></audio>", "<audio controls=\"\" src=\"data:foobar\"></audio>", toxml) yield (runSanitizerTest, "test_data_uri_disallowed_type", "<audio controls=\"\"></audio>", "<audio controls=\"\" src=\"data:text/html,<html>\"></audio>", toxml) for protocol in sanitizer.HTMLSanitizer.allowed_protocols: rest_of_uri = '//sub.domain.tld/path/object.ext' if protocol == 'data': rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ=' protocol = protocol.upper() yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol, "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri), """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri), toxml)