# This file is part of EbookLib. # Copyright (c) 2013 Aleksandar Erkalovic # # EbookLib is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # EbookLib is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with EbookLib. If not, see . import six from ebooklib.plugins.base import BasePlugin from ebooklib.utils import parse_html_string # TODO: # - should also look for the _required_ elements # http://www.w3.org/html/wg/drafts/html/master/tabular-data.html#the-table-element ATTRIBUTES_GLOBAL = [ "accesskey", "class", "contenteditable", "contextmenu", "dir", "draggable", "dropzone", "hidden", "id", "inert", "itemid", "itemprop", "itemref", "itemscope", "itemtype", "lang", "spellcheck", "style", "tabindex", "title", "translate", "epub:type", ] # Remove for now from here DEPRECATED_TAGS = [ "acronym", "applet", "basefont", "big", "center", "dir", "font", "frame", "frameset", "isindex", "noframes", "s", "strike", "tt", ] def leave_only(item, tag_list): for _attr in six.iterkeys(item.attrib): if _attr not in tag_list: del item.attrib[_attr] class SyntaxPlugin(BasePlugin): NAME = "Check HTML syntax" def html_before_write(self, book, chapter): from lxml import etree try: tree = parse_html_string(chapter.content) except Exception: return root = tree.getroottree() # delete deprecated tags # i should really have a list of allowed tags for tag in DEPRECATED_TAGS: etree.strip_tags(root, tag) head = tree.find("head") if head is not None and len(head) != 0: for _item in head: if _item.tag == "base": leave_only(_item, ATTRIBUTES_GLOBAL + ["href", "target"]) elif _item.tag == "link": leave_only( _item, ATTRIBUTES_GLOBAL + ["href", "crossorigin", "rel", "media", "hreflang", "type", "sizes"] ) elif _item.tag == "title": if _item.text == "": head.remove(_item) elif _item.tag == "meta": leave_only(_item, ATTRIBUTES_GLOBAL + ["name", "http-equiv", "content", "charset"]) # just remove for now, but really should not be like this head.remove(_item) elif _item.tag == "script": leave_only(_item, ATTRIBUTES_GLOBAL + ["src", "type", "charset", "async", "defer", "crossorigin"]) elif _item.tag == "source": leave_only(_item, ATTRIBUTES_GLOBAL + ["src", "type", "media"]) elif _item.tag == "style": leave_only(_item, ATTRIBUTES_GLOBAL + ["media", "type", "scoped"]) else: leave_only(_item, ATTRIBUTES_GLOBAL) if len(root.find("body")) != 0: body = tree.find("body") for _item in body.iter(): # it is not # if _item.tag == "a": leave_only(_item, ATTRIBUTES_GLOBAL + ["href", "target", "download", "rel", "hreflang", "type"]) elif _item.tag == "area": leave_only( _item, ATTRIBUTES_GLOBAL + ["alt", "coords", "shape", "href", "target", "download", "rel", "hreflang", "type"], ) elif _item.tag == "audio": leave_only( _item, ATTRIBUTES_GLOBAL + ["src", "crossorigin", "preload", "autoplay", "mediagroup", "loop", "muted", "controls"], ) elif _item.tag == "blockquote": leave_only(_item, ATTRIBUTES_GLOBAL + ["cite"]) elif _item.tag == "button": leave_only( _item, ATTRIBUTES_GLOBAL + [ "autofocus", "disabled", "form", "formaction", "formenctype", "formmethod", "formnovalidate", "formtarget", "name", "type", "value", "menu", ], ) elif _item.tag == "canvas": leave_only(_item, ATTRIBUTES_GLOBAL + ["width", "height"]) elif _item.tag == "canvas": leave_only(_item, ATTRIBUTES_GLOBAL + ["width", "height"]) elif _item.tag == "del": leave_only(_item, ATTRIBUTES_GLOBAL + ["cite", "datetime"]) elif _item.tag == "details": leave_only(_item, ATTRIBUTES_GLOBAL + ["open"]) elif _item.tag == "embed": leave_only(_item, ATTRIBUTES_GLOBAL + ["src", "type", "width", "height"]) elif _item.tag == "fieldset": leave_only(_item, ATTRIBUTES_GLOBAL + ["disable", "form", "name"]) elif _item.tag == "details": leave_only( _item, ATTRIBUTES_GLOBAL + [ "accept-charset", "action", "autocomplete", "enctype", "method", "name", "novalidate", "target", ], ) elif _item.tag == "iframe": leave_only( _item, ATTRIBUTES_GLOBAL + ["src", "srcdoc", "name", "sandbox", "seamless", "allowfullscreen", "width", "height"], ) elif _item.tag == "img": _src = _item.get("src", "").lower() if _src.startswith("http://") or _src.startswith("https://"): if "remote-resources" not in chapter.properties: chapter.properties.append("remote-resources") # THIS DOES NOT WORK, ONLY VIDEO AND AUDIO FILES CAN BE REMOTE RESOURCES # THAT MEANS I SHOULD ALSO CATCH