# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
import codecs
import os
import re
import sys
from ._compat import escape, PY3, URLError, urllib
from .core import *
from .utils import get_stoplist, get_stoplists
def usage():
    """
    Returns the command-line help text.

    The program name is taken from the basename of sys.argv[0]; default
    values shown in the text are filled in from the module-level
    *_DEFAULT constants, DEFAULT_ENCODING and DEFAULT_ENC_ERRORS.
    """
    # The <p>, <h> and <b> markers below are the tags written by the
    # 'default', 'boilerplate' and 'detailed' output formats.
    return """Usage: %(progname)s -s STOPLIST [OPTIONS] [HTML_FILE]
Convert HTML to plain text and remove boilerplate.
-o OUTPUT_FILE if not specified, output is written to stdout
--encoding=... default character encoding to be used if not specified
in the HTML meta tags (default: %(default_encoding)s)
--enc-force force specified encoding, ignore HTML meta tags
--enc-errors=... errors handling for character encoding conversion:
strict: fail on error
ignore: ignore characters which can't be converted
replace: replace characters which can't be converted
with U+FFFD unicode replacement characters
(default: %(default_enc_errors)s)
--format=... output format; possible values:
default: one paragraph per line, each preceded with
<p> or <h> (headings)
boilerplate: same as default, except for boilerplate
paragraphs are included, too, preceded
with <b>
detailed: one paragraph per line, each preceded with
<p> tag containing detailed information
about classification as attributes
krdwrd: KrdWrd compatible format
--no-headings disable special handling of headings
--list-stoplists print a list of inbuilt stoplists and exit
-V, --version print version information and exit
-h, --help display this help and exit
If no HTML_FILE specified, input is read from stdin.
STOPLIST must be one of the following:
- one of the inbuilt stoplists; see:
%(progname)s --list-stoplists
- path to a file with the most frequent words for given language,
one per line, in UTF-8 encoding
- None - this activates a language-independent mode
Advanced options:
--length-low=INT (default %(length_low)i)
--length-high=INT (default %(length_high)i)
--stopwords-low=FLOAT (default %(stopwords_low)f)
--stopwords-high=FLOAT (default %(stopwords_high)f)
--max-link-density=FLOAT (default %(max_link_density)f)
--max-heading-distance=INT (default %(max_heading_distance)i)
""" % {
        'progname': os.path.basename(sys.argv[0]),
        'length_low': LENGTH_LOW_DEFAULT,
        'length_high': LENGTH_HIGH_DEFAULT,
        'stopwords_low': STOPWORDS_LOW_DEFAULT,
        'stopwords_high': STOPWORDS_HIGH_DEFAULT,
        'max_link_density': MAX_LINK_DENSITY_DEFAULT,
        'max_heading_distance': MAX_HEADING_DISTANCE_DEFAULT,
        'default_encoding': DEFAULT_ENCODING,
        'default_enc_errors': DEFAULT_ENC_ERRORS,
    }
def output_default(paragraphs, fp=sys.stdout, no_boilerplate=True):
    """
    Prints one line per paragraph to fp in the form:

      <tag> text of the paragraph

    where <tag> is <p> for a paragraph classified as 'good', <h> for a
    'good' paragraph that is a heading and <b> for any other paragraph.
    Paragraphs that are not 'good' are skipped entirely unless
    no_boilerplate is false. The text is passed through escape() with
    quote=False before printing.
    """
    for para in paragraphs:
        is_good = para.class_type == 'good'
        if no_boilerplate and not is_good:
            # boilerplate suppressed: nothing is printed for this paragraph
            continue

        if not is_good:
            marker = 'b'
        elif para.heading:
            marker = 'h'
        else:
            marker = 'p'

        line = '<%s> %s' % (marker, escape(para.text, quote=False))
        print(line, file=fp)
def output_detailed(paragraphs, fp=sys.stdout):
    """
    Same as output_default, but only <p> tags are used, no paragraph is
    skipped, and the classification details are added as attributes:
    class, cfclass, heading (0 or 1) and xpath.

    The paragraph text is passed through escape() with quote=False; the
    attribute values are written as they are.
    """
    # One placeholder per value in the tuple below (five in total).
    line_format = '<p class="%s" cfclass="%s" heading="%i" xpath="%s"> %s'
    for paragraph in paragraphs:
        output = line_format % (
            paragraph.class_type,
            paragraph.cf_class,
            int(paragraph.heading),
            paragraph.xpath,
            escape(paragraph.text, quote=False),
        )
        print(output, file=fp)
def output_krdwrd(paragraphs, fp=sys.stdout):
    """
    Prints the paragraphs in a KrdWrd compatible format, one line per
    text node:

      class<TAB>first text node
      class<TAB>second text node
      ...

    where class is 1 (boilerplate), 2 (undecided) or 3 (good). Paragraphs
    classified as 'good' or 'neargood' get 3, unless they are headings,
    which are reported as undecided (2); everything else gets 1. Each
    text node is stripped of surrounding whitespace.
    """
    for para in paragraphs:
        if para.class_type not in ('good', 'neargood'):
            label = 1
        else:
            label = 2 if para.heading else 3

        for node in para.text_nodes:
            print('%i\t%s' % (label, node.strip()), file=fp)
def _parse_number(option, value, convert, expected):
    """
    Converts the argument of a numeric command-line option.

    convert is the conversion callable (int or float) and expected is the
    type name used in the error message ("Integer" or "Float"). Raises
    JustextInvalidOptions when convert rejects value with a ValueError.
    """
    try:
        return convert(value)
    except ValueError:
        raise JustextInvalidOptions(
            "Invalid value for %s: '%s'. %s expected." % (option, value, expected))


def _load_stoplist(name):
    """
    Resolves the argument of the -s option to a stoplist.

    'none' (case-insensitive) gives an empty set, an existing file is read
    as UTF-8 with one word per line, and any other value is looked up among
    the names returned by get_stoplists(). Raises JustextInvalidOptions if
    the file cannot be read or decoded, or if the name is unknown.
    """
    if name.lower() == 'none':
        return set()

    if os.path.isfile(name):
        try:
            # 'with' guarantees the file is closed even if decoding fails
            with codecs.open(name, 'r', 'utf8') as fp_stoplist:
                return set(line.strip() for line in fp_stoplist)
        except IOError as e:
            raise JustextInvalidOptions(
                "Can't open %s for reading: %s" % (name, e))
        except UnicodeDecodeError as e:
            raise JustextInvalidOptions(
                "Unicode decoding error when reading "
                "the stoplist (probably not in UTF-8): %s" % e)

    if name in get_stoplists():
        return get_stoplist(name)

    if re.match(r'^\w*$', name):
        # only word characters, probably misspelled or unsupported language
        raise JustextInvalidOptions(
            "Unknown stoplist: %s\nAvailable stoplists:\n%s" % (
                name, '\n'.join(sorted(get_stoplists()))))

    # probably incorrectly specified path
    raise JustextInvalidOptions("File not found: %s" % name)


def main():
    """
    Command-line entry point.

    Parses sys.argv, reads the HTML from the file or URL given as the first
    positional argument (or from stdin when there is none), runs justext()
    on it and prints the paragraphs in the selected format, UTF-8 encoded,
    to stdout or to the file given with -o.

    Exits with status 1 after printing a message to stderr when the options
    cannot be parsed or a JustextError is raised; exits with status 0 after
    --help, --version and --list-stoplists.
    """
    import getopt
    from justext import __version__ as VERSION

    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:s:hV", ["encoding=",
            "enc-force", "enc-errors=", "format=",
            "no-headings", "help", "version", "length-low=", "length-high=",
            "stopwords-low=", "stopwords-high=", "max-link-density=",
            "max-heading-distance=", "list-stoplists"])
    except getopt.GetoptError as e:
        print(e, file=sys.stderr)
        print(usage(), file=sys.stderr)
        sys.exit(1)

    # Output is always written as UTF-8; on Python 3 the writer has to wrap
    # the underlying byte stream of stdout.
    stream_writer = codecs.lookup('utf8')[-1]
    fp_in = sys.stdin
    if PY3:
        fp_out = stream_writer(sys.stdout.buffer)
    else:
        fp_out = stream_writer(sys.stdout)
    close_output = False  # set once -o has opened a file that we own

    stoplist = None
    output_format = 'default'
    no_headings = False
    length_low = LENGTH_LOW_DEFAULT
    length_high = LENGTH_HIGH_DEFAULT
    stopwords_low = STOPWORDS_LOW_DEFAULT
    stopwords_high = STOPWORDS_HIGH_DEFAULT
    max_link_density = MAX_LINK_DENSITY_DEFAULT
    max_heading_distance = MAX_HEADING_DISTANCE_DEFAULT
    encoding = None
    default_encoding = DEFAULT_ENCODING
    force_default_encoding = False
    enc_errors = DEFAULT_ENC_ERRORS

    try:
        for o, a in opts:
            if o in ("-h", "--help"):
                print(usage())
                sys.exit(0)
            elif o in ("-V", "--version"):
                print("%s: jusText v%s\n\nCopyright (c) 2011 Jan Pomikalek " % (
                    os.path.basename(sys.argv[0]), VERSION))
                sys.exit(0)
            elif o == "--list-stoplists":
                print("\n".join(sorted(get_stoplists())))
                sys.exit(0)
            elif o == "-o":
                try:
                    fp_out = codecs.open(a, 'w', 'utf8')
                except IOError as e:
                    raise JustextInvalidOptions(
                        "Can't open %s for writing: %s" % (a, e))
                close_output = True
            elif o == "-s":
                stoplist = _load_stoplist(a)
            elif o == "--encoding":
                try:
                    # encoding an empty string fails with LookupError when
                    # the codec name is not usable for text
                    ''.encode(a)
                except LookupError:
                    raise JustextInvalidOptions(
                        "Unknown character encoding: %s" % a)
                default_encoding = a
            elif o == "--enc-force":
                force_default_encoding = True
            elif o == "--enc-errors":
                if a.lower() in ['strict', 'ignore', 'replace']:
                    enc_errors = a.lower()
                else:
                    raise JustextInvalidOptions(
                        "Invalid --enc-errors value: %s" % a)
            elif o == "--format":
                if a in ['default', 'boilerplate', 'detailed', 'krdwrd']:
                    output_format = a
                else:
                    raise JustextInvalidOptions(
                        "Unknown output format: %s" % a)
            elif o == "--no-headings":
                no_headings = True
            elif o == "--length-low":
                length_low = _parse_number(o, a, int, "Integer")
            elif o == "--length-high":
                length_high = _parse_number(o, a, int, "Integer")
            elif o == "--stopwords-low":
                stopwords_low = _parse_number(o, a, float, "Float")
            elif o == "--stopwords-high":
                stopwords_high = _parse_number(o, a, float, "Float")
            elif o == "--max-link-density":
                max_link_density = _parse_number(o, a, float, "Float")
            elif o == "--max-heading-distance":
                max_heading_distance = _parse_number(o, a, int, "Integer")

        if force_default_encoding:
            encoding = default_encoding

        if stoplist is None:
            raise JustextInvalidOptions("No stoplist specified.")

        if not stoplist:
            # empty stoplist, switch to language-independent mode
            stopwords_high = 0
            stopwords_low = 0

        if args:
            try:
                if re.match(r"[^:/]+://", args[0]):
                    fp_in = urllib.urlopen(args[0])
                else:
                    # Read raw bytes, as the URL branch does, so that the
                    # --encoding/--enc-force/--enc-errors options decide how
                    # the document is decoded rather than the locale.
                    fp_in = open(args[0], 'rb')
            except (IOError, URLError) as e:
                raise JustextInvalidOptions(
                    "Can't open %s for reading: %s" % (args[0], e))

        try:
            html_text = fp_in.read()
        finally:
            # stdin is not ours to close
            if fp_in is not sys.stdin:
                fp_in.close()

        paragraphs = justext(html_text, stoplist, length_low, length_high,
            stopwords_low, stopwords_high, max_link_density, max_heading_distance,
            no_headings, encoding, default_encoding, enc_errors)

        if output_format == "default":
            output_default(paragraphs, fp_out)
        elif output_format == "boilerplate":
            output_default(paragraphs, fp_out, no_boilerplate=False)
        elif output_format == "detailed":
            output_detailed(paragraphs, fp_out)
        elif output_format == "krdwrd":
            output_krdwrd(paragraphs, fp_out)
        else:
            # this should not happen; format checked when parsing options
            raise AssertionError("Unknown format: %s" % output_format)
    except JustextError as e:
        print("%s: %s" % (os.path.basename(sys.argv[0]), e), file=sys.stderr)
        sys.exit(1)
    finally:
        # flush and release the -o file on every exit path, including
        # sys.exit(); the stdout writer is left open
        if close_output:
            fp_out.close()
# Run the command-line interface only when this module is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()