# pylint:disable-msg=E0611,I1101 """Minimalistic fork of readability-lxml code This is a python port of a ruby port of arc90's readability project http://lab.arc90.com/experiments/readability/ Given a html document, it pulls out the main body text and cleans it up. Ruby port by starrhorne and iterationlabs Python port by gfxmonk For list of contributors see https://github.com/timbertson/python-readability https://github.com/buriy/python-readability License of forked code: Apache-2.0. """ import logging import re from math import sqrt from operator import attrgetter from typing import Any, Dict, Optional, Set from lxml.etree import tostring from lxml.html import HtmlElement, fragment_fromstring from .utils import load_html, trim LOGGER = logging.getLogger(__name__) DOT_SPACE = re.compile(r"\.( |$)") def _tostring(string: HtmlElement) -> str: return tostring(string, encoding=str, method="xml") DIV_TO_P_ELEMS = { "a", "blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul", } DIV_SCORES = {"div", "article"} BLOCK_SCORES = {"pre", "td", "blockquote"} BAD_ELEM_SCORES = {"address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"} STRUCTURE_SCORES = {"h1", "h2", "h3", "h4", "h5", "h6", "th", "header", "footer", "nav"} TEXT_CLEAN_ELEMS = {"p", "img", "li", "a", "embed", "input"} REGEXES = { "unlikelyCandidatesRe": re.compile( r"combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter", re.I, ), "okMaybeItsACandidateRe": re.compile(r"and|article|body|column|main|shadow", re.I), "positiveRe": re.compile( r"article|body|content|entry|hentry|main|page|pagination|post|text|blog|story", re.I, ), "negativeRe": re.compile( r"button|combx|comment|com-|contact|figure|foot|footer|footnote|form|input|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget", re.I, ), "divToPElementsRe": re.compile( r"<(?:a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I ), "videoRe": re.compile(r"https?:\/\/(?:www\.)?(?:youtube|vimeo)\.com", re.I), } FRAME_TAGS = {"body", "html"} LIST_TAGS = {"ol", "ul"} # DIV_TO_P_ELEMS = {'a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre', 'table', 'ul'} def text_length(elem: HtmlElement) -> int: "Return the length of the element with all its contents." return len(trim(elem.text_content())) class Candidate: "Defines a class to score candidate elements." __slots__ = ["score", "elem"] def __init__(self, score: float, elem: HtmlElement) -> None: self.score: float = score self.elem: HtmlElement = elem class Document: """Class to build a etree document out of html.""" __slots__ = ["doc", "min_text_length", "retry_length"] def __init__(self, doc: HtmlElement, min_text_length: int = 25, retry_length: int = 250) -> None: """Generate the document :param doc: string of the html content. :param min_text_length: Set to a higher value for more precise detection of longer texts. :param retry_length: Set to a lower value for better detection of very small texts. The Document class is not re-enterable. It is designed to create a new Document() for each HTML file to process it. API method: .summary() -- cleaned up content """ self.doc = doc self.min_text_length = min_text_length self.retry_length = retry_length def summary(self) -> str: """ Given a HTML file, extracts the text of the article. Warning: It mutates internal DOM representation of the HTML document, so it is better to call other API methods before this one. """ for elem in self.doc.iter("script", "style"): elem.drop_tree() ruthless = True while True: if ruthless: self.remove_unlikely_candidates() self.transform_misused_divs_into_paragraphs() candidates = self.score_paragraphs() best_candidate = self.select_best_candidate(candidates) if best_candidate: article = self.get_article(candidates, best_candidate) else: if ruthless is True: ruthless = False LOGGER.debug( "Ended up stripping too much - going for a safer parse" ) # try again continue # go ahead LOGGER.debug( "Ruthless and lenient parsing did not work. Returning raw html" ) body = self.doc.find("body") article = body if body is not None else self.doc cleaned_article = self.sanitize(article, candidates) article_length = len(cleaned_article or "") if ruthless and article_length < self.retry_length: ruthless = False # Loop through and try again. continue return cleaned_article def get_article(self, candidates: Dict[HtmlElement, Candidate], best_candidate: Candidate) -> HtmlElement: # Now that we have the top candidate, look through its siblings for # content that might also be related. # Things like preambles, content split by ads that we removed, etc. sibling_score_threshold = max(10, best_candidate.score * 0.2) # create a new html document with a div output = fragment_fromstring("
") parent = best_candidate.elem.getparent() siblings = list(parent) if parent is not None else [best_candidate.elem] for sibling in siblings: # in lxml there no concept of simple text # if isinstance(sibling, NavigableString): continue append = False # conditions if sibling == best_candidate.elem or ( sibling in candidates and candidates[sibling].score >= sibling_score_threshold ): append = True elif sibling.tag == "p": link_density = self.get_link_density(sibling) node_content = sibling.text or "" node_length = len(node_content) if ( node_length > 80 and link_density < 0.25 or ( node_length <= 80 and link_density == 0 and DOT_SPACE.search(node_content) ) ): append = True # append to the output div if append: output.append(sibling) # if output is not None: # output.append(best_candidate.elem) return output def select_best_candidate(self, candidates: Dict[HtmlElement, Candidate]) -> Optional[Candidate]: if not candidates: return None sorted_candidates = sorted( candidates.values(), key=attrgetter("score"), reverse=True ) if LOGGER.isEnabledFor(logging.DEBUG): for candidate in sorted_candidates[:5]: LOGGER.debug("Top 5: %s %s", candidate.elem.tag, candidate.score) return next(iter(sorted_candidates)) def get_link_density(self, elem: HtmlElement) -> float: total_length = text_length(elem) or 1 link_length = sum(text_length(link) for link in elem.findall(".//a")) return link_length / total_length def score_paragraphs(self) -> Dict[HtmlElement, Candidate]: candidates = {} for elem in self.doc.iter("p", "pre", "td"): parent_node = elem.getparent() if parent_node is None: continue grand_parent_node = parent_node.getparent() elem_text = trim(elem.text_content()) elem_text_len = len(elem_text) # discard too short paragraphs if elem_text_len < self.min_text_length: continue for node in (parent_node, grand_parent_node): if node is not None and node not in candidates: candidates[node] = self.score_node(node) score = 1 + len(elem_text.split(",")) + min((elem_text_len / 100), 3) # if elem not in candidates: # candidates[elem] = self.score_node(elem) candidates[parent_node].score += score if grand_parent_node is not None: candidates[grand_parent_node].score += score / 2 # Scale the final candidates score based on link density. Good content # should have a relatively small link density (5% or less) and be # mostly unaffected by this operation. for elem, candidate in candidates.items(): candidate.score *= 1 - self.get_link_density(elem) return candidates def class_weight(self, elem: HtmlElement) -> float: weight = 0 for attribute in filter(None, (elem.get("class"), elem.get("id"))): if REGEXES["negativeRe"].search(attribute): weight -= 25 if REGEXES["positiveRe"].search(attribute): weight += 25 return weight def score_node(self, elem: HtmlElement) -> Candidate: score = self.class_weight(elem) tag = str(elem.tag) name = tag.lower() if name in DIV_SCORES: score += 5 elif name in BLOCK_SCORES: score += 3 elif name in BAD_ELEM_SCORES: score -= 3 elif name in STRUCTURE_SCORES: score -= 5 return Candidate(score, elem) def remove_unlikely_candidates(self) -> None: for elem in self.doc.findall(".//*"): attrs = " ".join(filter(None, (elem.get("class"), elem.get("id")))) if len(attrs) < 2: continue if ( elem.tag not in FRAME_TAGS and REGEXES["unlikelyCandidatesRe"].search(attrs) and (not REGEXES["okMaybeItsACandidateRe"].search(attrs)) ): # LOGGER.debug("Removing unlikely candidate: %s", elem.tag) elem.drop_tree() def transform_misused_divs_into_paragraphs(self) -> None: for elem in self.doc.findall(".//div"): # transforms
# FIXME: The current implementation ignores all descendants that
# are not direct children of elem
# This results in incorrect results in case there is an
# buried within an for example
# hurts precision:
# if not any(e.tag in DIV_TO_P_ELEMS for e in list(elem)):
if not REGEXES["divToPElementsRe"].search(
"".join(map(_tostring, list(elem)))
):
elem.tag = "p"
for elem in self.doc.findall(".//div"):
if elem.text and elem.text.strip():
p_elem = fragment_fromstring("")
p_elem.text, elem.text = elem.text, None
elem.insert(0, p_elem)
for pos, child in sorted(enumerate(elem), reverse=True):
if child.tail and child.tail.strip():
p_elem = fragment_fromstring("")
p_elem.text, child.tail = child.tail, None
elem.insert(pos + 1, p_elem)
if child.tag == "br":
child.drop_tree()
def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate]) -> str:
for header in node.iter("h1", "h2", "h3", "h4", "h5", "h6"):
if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
header.drop_tree()
for elem in node.iter("form", "textarea"):
elem.drop_tree()
for elem in node.iter("iframe"):
if "src" in elem.attrib and REGEXES["videoRe"].search(elem.attrib["src"]):
elem.text = "VIDEO" # ADD content to iframe text node to force proper output
else:
elem.drop_tree()
allowed: Set[HtmlElement] = set()
# Conditionally clean