# cython: language_level=3str """A cleanup tool for HTML. Removes unwanted tags and content. See the `Cleaner` class for details. """ import copy import re from collections import deque from urllib.parse import urlsplit, unquote_plus import warnings from lxml import etree from lxml.html import defs from lxml.html import fromstring as lxml_fromstring, XHTML_NAMESPACE from lxml.html import xhtml_to_html, _transform_result __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', 'word_break', 'word_break_html', 'LXMLHTMLCleanWarning', 'AmbiguousURLWarning'] # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl # Particularly the CSS cleaning; most of the tag cleaning is integrated now # I have multiple kinds of schemes searched; but should schemes be # whitelisted instead? # max height? # remove images? Also in CSS? background attribute? # Some way to whitelist object, iframe, etc (e.g., if you want to # allow *just* embedded YouTube movies) # Log what was deleted and why? # style="behavior: ..." might be bad in IE? # Should we have something for just ? That's the worst of the # metas. # UTF-7 detections? Example: # +ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- # you don't always have to have the charset set, if the page has no charset # and there's UTF7-like code in it. # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php # This is an IE-specific construct you can have in a stylesheet to # run some Javascript: _replace_css_javascript = re.compile( r'expression\s*\(.*?\)', re.S|re.I).sub # Do I have to worry about @\nimport? _replace_css_import = re.compile( r'@\s*import', re.I).sub _looks_like_tag_content = re.compile( r' safe_image_urls _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx _conditional_comment_re = re.compile( r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) _find_styled_elements = etree.XPath( "descendant-or-self::*[@style]") _find_external_links = etree.XPath( ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |" "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"), namespaces={'x':XHTML_NAMESPACE}) # Regex to remove all ASCII control characters (00-1F,7F) except: # - 09 - Horizontal tab # - 0A - Line Feed # - 0B - Vertical tab # - 0D - Carriage Return _ascii_control_characters_str = re.compile("[\x00-\x08\x0C\x0E-\x1F\x7F]") _ascii_control_characters_bytes = re.compile(b"[\x00-\x08\x0C\x0E-\x1F\x7F]") def fromstring(data): """ Enhanced fromstring function that removes ASCII control chars before passing the input to the original lxml.html.fromstring. """ if isinstance(data, bytes): return lxml_fromstring(_ascii_control_characters_bytes.sub(b"", data)) else: return lxml_fromstring(_ascii_control_characters_str.sub("", data)) # This regular expression is inspired by the one in urllib3. _URI_RE = re.compile( r"^(?:(?P[a-zA-Z][a-zA-Z0-9+.-]*[a-zA-Z0-9]):)?" r"(?://(?P[^\\/?#]*))?" r"(?P[^?#]*)" r"(?:\?(?P[^#]*))?" r"(?:#(?P.*))?$", re.UNICODE, ) def _get_authority_from_url(url): match = _URI_RE.match(url) if match: return match.group("authority") else: return None class LXMLHTMLCleanWarning(Warning): pass class AmbiguousURLWarning(LXMLHTMLCleanWarning): pass class Cleaner: """ Instances cleans the document of each of the possible offending elements. The cleaning is controlled by attributes; you can override attributes in a subclass, or set them in the constructor. ``scripts``: Removes any ``