# -*- coding: utf-8 -*-
"""
Copyright (c) 2011 Jan Pomikalek
This software is licensed as described in the file LICENSE.rst.
"""
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
import re
import lxml.html
import lxml.sax
try:
from functools import lru_cache
except ImportError:
from backports.functools_lru_cache import lru_cache
from lxml.html.clean import Cleaner
from xml.sax.handler import ContentHandler
from .paragraph import Paragraph
from ._compat import unicode, ignored
from .utils import is_blank
# Default thresholds used by classify_paragraphs() and justext().
# Paragraphs whose link density exceeds this value are classified as 'bad'.
MAX_LINK_DENSITY_DEFAULT = 0.2
# Paragraphs with len() below this value are classified as 'short' (or 'bad'
# when they contain link text).
LENGTH_LOW_DEFAULT = 70
# Paragraphs with len() above this value and a high stopword density are
# classified as 'good'.
LENGTH_HIGH_DEFAULT = 200
# Stopword density at or above this value makes a paragraph at least 'neargood'.
STOPWORDS_LOW_DEFAULT = 0.30
# Stopword density at or above this value makes a long paragraph 'good'.
STOPWORDS_HIGH_DEFAULT = 0.32
# When True, no paragraph is flagged as a heading.
NO_HEADINGS_DEFAULT = False

# Short and near-good headings within MAX_HEADING_DISTANCE characters before
# a good paragraph are classified as good unless --no-headings is specified.
MAX_HEADING_DISTANCE_DEFAULT = 200

# Tags whose start or end closes the current paragraph and opens a new one.
PARAGRAPH_TAGS = frozenset({
    'body', 'blockquote', 'caption', 'center', 'col', 'colgroup', 'dd',
    'div', 'dl', 'dt', 'fieldset', 'form', 'legend', 'optgroup', 'option',
    'p', 'pre', 'table', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr',
    'ul', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
})

# Fallback encoding and error handler used when decoding/encoding HTML.
DEFAULT_ENCODING = 'utf8'
DEFAULT_ENC_ERRORS = 'replace'

# Finds the charset declared inside a <meta ...> tag of a bytes document;
# group 1 is the raw (bytes) encoding name. The pattern must be anchored on
# the opening "<meta" so that it only looks inside meta tags.
CHARSET_META_TAG_PATTERN = re.compile(br"""<meta[^>]+charset=["']?([^'"/>\s]+)""", re.IGNORECASE)

# Classes that terminate the neighbour search in _get_neighbour().
GOOD_OR_BAD = {'good', 'bad'}
class JustextError(Exception):
    """Common ancestor of the exceptions raised by jusText itself."""
class JustextInvalidOptions(JustextError):
    """
    JustextError subclass with no behaviour of its own.

    NOTE(review): nothing in this part of the file raises it; presumably it
    reports invalid option values - confirm against the callers.
    """
def html_to_dom(html, default_encoding=DEFAULT_ENCODING, encoding=None, errors=DEFAULT_ENC_ERRORS):
    """
    Parses an HTML page into an lxml DOM tree.

    `html` may be text or bytes. Two representations are prepared: the
    decoded text, which is parsed first, and the raw bytes, which are parsed
    as a fallback when lxml rejects the text with a ValueError.

    For text input the bytes are produced with `encoding` if given, otherwise
    with `default_encoding`; for bytes input the text comes from
    decode_html(). `errors` is the codec error handler used in both cases.
    """
    if not isinstance(html, unicode):
        raw_html = html
        text_html = decode_html(html, default_encoding, encoding, errors)
    else:
        text_html = html
        # keep an encoded copy in case it's XML with an encoding declaration
        raw_html = html.encode(encoding or default_encoding, errors)

    try:
        return lxml.html.fromstring(text_html, parser=lxml.html.HTMLParser())
    except ValueError:
        # Unicode strings with an encoding declaration are not supported by
        # lxml; for such XHTML files parse the bytes so that the declared
        # encoding is used.
        return lxml.html.fromstring(raw_html, parser=lxml.html.HTMLParser())
def decode_html(html, default_encoding=DEFAULT_ENCODING, encoding=None, errors=DEFAULT_ENC_ERRORS):
    """
    Converts a `html` containing an HTML page into Unicode.

    Text input is returned unchanged. For bytes the encoding is chosen in
    this order:

    1. `encoding`, when given (used unconditionally);
    2. the charset declared in a meta tag, when it names a known codec;
    3. UTF-8 (strict);
    4. `default_encoding`.

    `errors` is the codec error handler used in steps 1, 2 and 4.

    Raises JustextError when the final attempt with `default_encoding` fails.
    A UnicodeDecodeError raised while decoding with an explicitly given or
    declared encoding (possible only with a strict `errors` handler) is not
    converted and propagates to the caller.
    """
    if isinstance(html, unicode):
        return html

    if encoding:
        return html.decode(encoding, errors)

    match = CHARSET_META_TAG_PATTERN.search(html)
    if match:
        try:
            declared_encoding = match.group(1).decode("ASCII")
        except UnicodeDecodeError:
            # The declared name contains non-ASCII bytes, so it cannot be a
            # valid codec name; treat it like any other unknown encoding
            # instead of letting the error escape.
            declared_encoding = None

        if declared_encoding is not None:
            # proceed unknown encoding as if it wasn't found at all
            with ignored(LookupError):
                return html.decode(declared_encoding, errors)

    # unknown encoding
    try:
        # try UTF-8 first
        return html.decode("utf8")
    except UnicodeDecodeError:
        # try lucky with default encoding
        try:
            return html.decode(default_encoding, errors)
        except UnicodeDecodeError as e:
            raise JustextError("Unable to decode the HTML to Unicode: " + unicode(e))
def preprocessor(dom):
    """
    Removes unwanted parts of DOM.

    Runs an lxml Cleaner over `dom` and returns the result of its
    clean_html() call. The cleaner is configured to drop scripts, comments,
    style, embedded content and forms, and to kill the whole <head>; every
    other cleaning option listed below is switched off.
    """
    cleaner = Cleaner(
        # switched off
        processing_instructions=False,
        remove_unknown_tags=False,
        safe_attrs_only=False,
        page_structure=False,
        annoying_tags=False,
        frames=False,
        meta=False,
        links=False,
        javascript=False,
        # switched on
        scripts=True,
        comments=True,
        style=True,
        embedded=True,
        forms=True,
        kill_tags=("head",),
    )
    return cleaner.clean_html(dom)
# super(...).__init__() breaks Python 2.7 - TypeError: super() argument 1 must be type, not classobj
# noinspection PyMissingConstructor
class ParagraphMaker(ContentHandler):
    """
    A class for converting a HTML page represented as a DOM object into a list
    of paragraphs.

    It is a SAX content handler: make_paragraphs() replays the DOM as SAX
    events and the handler methods cut the text stream into Paragraph
    objects. A new paragraph starts at the start and at the end of every tag
    in PARAGRAPH_TAGS, at a <br> that directly follows another <br>, and at
    the end of the document. Paragraphs without text are dropped.
    """

    @classmethod
    def make_paragraphs(cls, root):
        """Converts DOM into paragraphs and returns them as a list."""
        handler = cls()
        lxml.sax.saxify(root, handler)
        return handler.paragraphs

    def __init__(self):
        # path of the element currently being processed
        self.path = PathInfo()
        # finished paragraphs, in document order
        self.paragraphs = []
        # paragraph currently being filled
        self.paragraph = None
        # True while inside an <a> element
        self.link = False
        # True when the previous event was a <br> (no text seen since)
        self.br = False
        self._start_new_pragraph()

    def _start_new_pragraph(self):
        """Stores the current paragraph if it has text and opens a new one."""
        if self.paragraph and self.paragraph.contains_text():
            self.paragraphs.append(self.paragraph)

        self.paragraph = Paragraph(self.path)

    def startElementNS(self, name, qname, attrs):
        """Handles an opening tag; `name` is a (namespace, local name) pair."""
        name = name[1]
        self.path.append(name)

        if name in PARAGRAPH_TAGS or (name == "br" and self.br):
            if name == "br":
                # the <br><br> is a paragraph separator and should
                # not be included in the number of tags within the
                # paragraph
                self.paragraph.tags_count -= 1
            self._start_new_pragraph()
        else:
            self.br = name == "br"
            if self.br:
                # a single <br> only separates words
                self.paragraph.append_text(' ')
            elif name == 'a':
                self.link = True
            self.paragraph.tags_count += 1

    def endElementNS(self, name, qname):
        """Handles a closing tag; `name` is a (namespace, local name) pair."""
        name = name[1]
        self.path.pop()

        if name in PARAGRAPH_TAGS:
            self._start_new_pragraph()
        if name == 'a':
            self.link = False

    def endDocument(self):
        """Flushes the last paragraph."""
        self._start_new_pragraph()

    def characters(self, content):
        """Appends non-blank text to the current paragraph."""
        if is_blank(content):
            return

        # append_text() returns the text that is counted below
        # (presumably the normalised form of `content` - TODO confirm)
        text = self.paragraph.append_text(content)

        if self.link:
            self.paragraph.chars_count_in_links += len(text)
        self.br = False
class PathInfo(object):
    """
    Tracks the chain of open elements while a document is traversed.

    Every entry is a triple (tag name, order, children) where `order` is the
    1-based position of the element among same-named siblings and `children`
    maps child tag names to the number of such children seen so far.
    """

    def __init__(self):
        # list of triples (tag name, order, children)
        self._elements = []

    @property
    def dom(self):
        """Dot-separated tag names of the open elements, e.g. 'html.body.p'."""
        tag_names = [tag for tag, _order, _children in self._elements]
        return ".".join(tag_names)

    @property
    def xpath(self):
        """XPath-like location of the current element, e.g. '/html[1]/p[2]'."""
        steps = ["%s[%d]" % (tag, order) for tag, order, _children in self._elements]
        return "/" + "/".join(steps)

    def append(self, tag_name):
        """Enters a child element named `tag_name`; returns self."""
        siblings = self._get_children()
        position = siblings.get(tag_name, 0) + 1
        siblings[tag_name] = position

        self._elements.append((tag_name, position, {}))
        return self

    def _get_children(self):
        """Returns the child counters of the innermost open element."""
        if self._elements:
            return self._elements[-1][2]
        # no open element: a throw-away dict, nothing is remembered
        return {}

    def pop(self):
        """Leaves the innermost open element; returns self."""
        self._elements.pop()
        return self
def _lowercase_stoplist(stoplist):
    "Builds a frozen set of the lower-cased words of `stoplist`."
    return frozenset(w.lower() for w in stoplist)


# Memoised variant; usable only with hashable stoplists (e.g. frozenset).
_cached_stoplist = lru_cache(maxsize=128)(_lowercase_stoplist)  # 100 stoplists


def define_stoplist(stoplist):
    """
    Lower-case all words in stoplist and create frozen set.

    The result is cached per `stoplist` object when the argument is
    hashable (e.g. a frozenset or a tuple). Unhashable iterables such as a
    plain `set` or `list` cannot be used as a cache key, so they are
    converted on every call instead of raising TypeError.
    """
    try:
        hash(stoplist)
    except TypeError:
        return _lowercase_stoplist(stoplist)
    return _cached_stoplist(stoplist)


# Keep the cache helpers reachable from the public name, as they were when
# the function itself carried the lru_cache decorator.
define_stoplist.cache_info = _cached_stoplist.cache_info
define_stoplist.cache_clear = _cached_stoplist.cache_clear
def classify_paragraphs(paragraphs, stoplist, length_low=LENGTH_LOW_DEFAULT,
        length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT,
        stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT,
        no_headings=NO_HEADINGS_DEFAULT):
    """
    Context-free paragraph classification.

    Sets `heading` and `cf_class` on every paragraph, looking at that
    paragraph alone. `cf_class` is the first rule that applies:

    * link density above `max_link_density` -> 'bad'
    * text contains the copyright sign or a literal '&copy' -> 'bad'
    * 'select' occurs in the DOM path -> 'bad'
    * length below `length_low` -> 'bad' if it has link text, else 'short'
    * stopword density >= `stopwords_high` -> 'good' if the length is above
      `length_high`, else 'neargood'
    * stopword density >= `stopwords_low` -> 'neargood'
    * otherwise -> 'bad'

    `heading` is True only for heading paragraphs when `no_headings` is
    false. `stoplist` is an iterable of stopwords; it is lower-cased first.
    Nothing is returned, the paragraphs are modified in place.
    """
    stoplist = define_stoplist(stoplist)

    for paragraph in paragraphs:
        length = len(paragraph)
        stopword_density = paragraph.stopwords_density(stoplist)
        link_density = paragraph.links_density()
        paragraph.heading = bool(not no_headings and paragraph.is_heading)

        if link_density > max_link_density:
            paragraph.cf_class = 'bad'
        # NOTE(review): the second operand used to repeat the copyright sign
        # ('\xa9' and the literal character are the same string), which made
        # it dead code; it now checks for an entity left undecoded in the
        # text - confirm this is the intended check.
        elif ('\xa9' in paragraph.text) or ('&copy' in paragraph.text):
            paragraph.cf_class = 'bad'
        elif 'select' in paragraph.dom_path:
            paragraph.cf_class = 'bad'
        elif length < length_low:
            if paragraph.chars_count_in_links > 0:
                paragraph.cf_class = 'bad'
            else:
                paragraph.cf_class = 'short'
        elif stopword_density >= stopwords_high:
            if length > length_high:
                paragraph.cf_class = 'good'
            else:
                paragraph.cf_class = 'neargood'
        elif stopword_density >= stopwords_low:
            paragraph.cf_class = 'neargood'
        else:
            paragraph.cf_class = 'bad'
def _get_neighbour(i, paragraphs, ignore_neargood, inc, boundary):
    """
    Walks from paragraph `i` in steps of `inc` and returns the first decisive
    class found before the index reaches `boundary`.

    'good' and 'bad' are always decisive; 'neargood' is decisive only when
    `ignore_neargood` is false. When nothing decisive is found, 'bad' is
    returned.
    """
    position = i + inc
    while position != boundary:
        label = paragraphs[position].class_type
        if label in GOOD_OR_BAD or (label == 'neargood' and not ignore_neargood):
            return label
        position += inc
    return 'bad'
def get_prev_neighbour(i, paragraphs, ignore_neargood):
    """
    Return the class of the paragraph at the top end of the short/neargood
    paragraphs block. If ignore_neargood is True, then only 'bad' or 'good'
    can be returned, otherwise 'neargood' can be returned, too.

    The search runs backwards from paragraph `i` towards the first paragraph.
    """
    step_back = -1
    before_first = -1
    return _get_neighbour(i, paragraphs, ignore_neargood, step_back, before_first)
def get_next_neighbour(i, paragraphs, ignore_neargood):
    """
    Return the class of the paragraph at the bottom end of the short/neargood
    paragraphs block. If ignore_neargood is True, then only 'bad' or 'good'
    can be returned, otherwise 'neargood' can be returned, too.

    The search runs forwards from paragraph `i` towards the last paragraph.
    """
    step_forward = 1
    past_last = len(paragraphs)
    return _get_neighbour(i, paragraphs, ignore_neargood, step_forward, past_last)
def _good_paragraph_follows(paragraphs, i, max_heading_distance):
    """
    Tells whether a paragraph classified 'good' follows paragraph `i` closely.

    The paragraphs after `i` are scanned while the accumulated text length of
    the skipped paragraphs does not exceed `max_heading_distance`.
    """
    j = i + 1
    distance = 0
    while j < len(paragraphs) and distance <= max_heading_distance:
        if paragraphs[j].class_type == 'good':
            return True
        distance += len(paragraphs[j].text)
        j += 1
    return False


def revise_paragraph_classification(paragraphs, max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT):
    """
    Context-sensitive paragraph classification. Assumes that
    classify_paragraphs has already been called.

    Sets `class_type` of every paragraph in place, in these steps:

    1. copy the context-free class (`cf_class`) into `class_type`;
    2. short headings followed closely by a good paragraph become 'neargood';
    3. 'short' paragraphs become 'good' or 'bad' depending on the nearest
       decisive neighbours on both sides;
    4. 'neargood' paragraphs become 'bad' when both neighbours are bad,
       otherwise 'good';
    5. headings that ended up 'bad' although their context-free class was not
       'bad' become 'good' when a good paragraph follows closely.

    "Closely" means within `max_heading_distance` characters of text.
    """
    # copy classes - this has to be finished for ALL paragraphs before the
    # heading pass below, because that pass looks ahead at the class of the
    # following paragraphs
    for paragraph in paragraphs:
        paragraph.class_type = paragraph.cf_class

    # good headings
    for i, paragraph in enumerate(paragraphs):
        if not (paragraph.heading and paragraph.class_type == 'short'):
            continue
        if _good_paragraph_follows(paragraphs, i, max_heading_distance):
            paragraph.class_type = 'neargood'

    # classify short
    # the new classes are collected first and applied afterwards so that the
    # decisions do not influence each other
    new_classes = {}
    for i, paragraph in enumerate(paragraphs):
        if paragraph.class_type != 'short':
            continue
        prev_neighbour = get_prev_neighbour(i, paragraphs, ignore_neargood=True)
        next_neighbour = get_next_neighbour(i, paragraphs, ignore_neargood=True)
        if prev_neighbour == 'good' and next_neighbour == 'good':
            new_classes[i] = 'good'
        elif prev_neighbour == 'bad' and next_neighbour == 'bad':
            new_classes[i] = 'bad'
        # one neighbour is 'good' and the other one 'bad': the paragraph is
        # good only if a 'neargood' paragraph lies between it and the bad side
        elif (prev_neighbour == 'bad' and get_prev_neighbour(i, paragraphs, ignore_neargood=False) == 'neargood') or \
                (next_neighbour == 'bad' and get_next_neighbour(i, paragraphs, ignore_neargood=False) == 'neargood'):
            new_classes[i] = 'good'
        else:
            new_classes[i] = 'bad'

    for i, c in new_classes.items():
        paragraphs[i].class_type = c

    # revise neargood
    for i, paragraph in enumerate(paragraphs):
        if paragraph.class_type != 'neargood':
            continue
        prev_neighbour = get_prev_neighbour(i, paragraphs, ignore_neargood=True)
        next_neighbour = get_next_neighbour(i, paragraphs, ignore_neargood=True)
        if (prev_neighbour, next_neighbour) == ('bad', 'bad'):
            paragraph.class_type = 'bad'
        else:
            paragraph.class_type = 'good'

    # more good headings
    for i, paragraph in enumerate(paragraphs):
        if not (paragraph.heading and paragraph.class_type == 'bad' and paragraph.cf_class != 'bad'):
            continue
        if _good_paragraph_follows(paragraphs, i, max_heading_distance):
            paragraph.class_type = 'good'
def justext(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
        length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT,
        stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT,
        max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT,
        encoding=None, default_encoding=DEFAULT_ENCODING,
        enc_errors=DEFAULT_ENC_ERRORS, preprocessor=preprocessor):
    """
    Converts an HTML page into a list of classified paragraphs. Each paragraph
    is represented as instance of class ``justext.paragraph.Paragraph``.

    The page goes through four stages: parsing (html_to_dom), cleaning
    (`preprocessor`, a callable taking and returning a DOM), splitting into
    paragraphs (ParagraphMaker) and classification - first context-free
    (classify_paragraphs), then context-sensitive
    (revise_paragraph_classification). The remaining arguments are handed
    over to those stages unchanged.
    """
    parsed = html_to_dom(html_text, default_encoding, encoding, enc_errors)
    cleaned = preprocessor(parsed)

    paragraphs = ParagraphMaker.make_paragraphs(cleaned)

    classify_paragraphs(
        paragraphs,
        stoplist,
        length_low=length_low,
        length_high=length_high,
        stopwords_low=stopwords_low,
        stopwords_high=stopwords_high,
        max_link_density=max_link_density,
        no_headings=no_headings,
    )
    revise_paragraph_classification(paragraphs, max_heading_distance=max_heading_distance)

    return paragraphs