You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
307 lines
9.0 KiB
307 lines
9.0 KiB
4 years ago
|
import argparse
|
||
|
import sys
|
||
|
|
||
|
from . import HTML2Text, __version__, config
|
||
|
|
||
|
|
||
|
def main() -> None:
    """Run the command-line interface.

    Parses the process arguments with :mod:`argparse`, reads the HTML input
    as bytes from the positional ``filename`` (or from standard input when
    the filename is omitted or is ``-``), decodes it using the positional
    ``encoding`` (default ``utf-8``) together with the ``--decode-errors``
    policy, copies the parsed options onto an ``HTML2Text`` instance and
    writes the result of its ``handle()`` call to standard output.

    Raises:
        UnicodeDecodeError: If the input cannot be decoded. A hint that
            suggests ``--decode-errors=ignore`` is written to standard
            error before the exception propagates.
    """
    baseurl = ""

    class bcolors:
        # ANSI escape sequences. Only WARNING, OKGREEN and ENDC are used
        # below (for the decode-error hint).
        HEADER = "\033[95m"
        OKBLUE = "\033[94m"
        OKGREEN = "\033[92m"
        WARNING = "\033[93m"
        FAIL = "\033[91m"
        ENDC = "\033[0m"
        BOLD = "\033[1m"
        UNDERLINE = "\033[4m"

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--default-image-alt",
        dest="default_image_alt",
        default=config.DEFAULT_IMAGE_ALT,
        help="The default alt string for images with missing ones",
    )
    parser.add_argument(
        "--pad-tables",
        dest="pad_tables",
        action="store_true",
        default=config.PAD_TABLES,
        help="pad the cells to equal column width in tables",
    )
    # Negative flag: passing it stores False into ``wrap_links``.
    parser.add_argument(
        "--no-wrap-links",
        dest="wrap_links",
        action="store_false",
        default=config.WRAP_LINKS,
        help="don't wrap links during conversion",
    )
    parser.add_argument(
        "--wrap-list-items",
        dest="wrap_list_items",
        action="store_true",
        default=config.WRAP_LIST_ITEMS,
        help="wrap list items during conversion",
    )
    parser.add_argument(
        "--ignore-emphasis",
        dest="ignore_emphasis",
        action="store_true",
        default=config.IGNORE_EMPHASIS,
        help="don't include any formatting for emphasis",
    )
    # Stored inverted: passing --reference-links sets ``inline_links`` False.
    parser.add_argument(
        "--reference-links",
        dest="inline_links",
        action="store_false",
        default=config.INLINE_LINKS,
        help="use reference style links instead of inline links",
    )
    # NOTE(review): the default is read from config.IGNORE_ANCHORS although
    # the destination is ``ignore_links`` -- confirm the constant name.
    parser.add_argument(
        "--ignore-links",
        dest="ignore_links",
        action="store_true",
        default=config.IGNORE_ANCHORS,
        help="don't include any formatting for links",
    )
    parser.add_argument(
        "--protect-links",
        dest="protect_links",
        action="store_true",
        default=config.PROTECT_LINKS,
        help="protect links from line breaks surrounding them with angle brackets",
    )
    parser.add_argument(
        "--ignore-images",
        dest="ignore_images",
        action="store_true",
        default=config.IGNORE_IMAGES,
        help="don't include any formatting for images",
    )
    parser.add_argument(
        "--images-as-html",
        dest="images_as_html",
        action="store_true",
        default=config.IMAGES_AS_HTML,
        help=(
            "Always write image tags as raw html; preserves `height`, `width` and "
            "`alt` if possible."
        ),
    )
    parser.add_argument(
        "--images-to-alt",
        dest="images_to_alt",
        action="store_true",
        default=config.IMAGES_TO_ALT,
        help="Discard image data, only keep alt text",
    )
    parser.add_argument(
        "--images-with-size",
        dest="images_with_size",
        action="store_true",
        default=config.IMAGES_WITH_SIZE,
        help=(
            "Write image tags with height and width attrs as raw html to retain "
            "dimensions"
        ),
    )
    parser.add_argument(
        "-g",
        "--google-doc",
        action="store_true",
        dest="google_doc",
        default=False,
        help="convert an html-exported Google Document",
    )
    parser.add_argument(
        "-d",
        "--dash-unordered-list",
        action="store_true",
        dest="ul_style_dash",
        default=False,
        help="use a dash rather than a star for unordered list items",
    )
    parser.add_argument(
        "-e",
        "--asterisk-emphasis",
        action="store_true",
        dest="em_style_asterisk",
        default=False,
        help="use an asterisk rather than an underscore for emphasized text",
    )
    parser.add_argument(
        "-b",
        "--body-width",
        dest="body_width",
        type=int,
        default=config.BODY_WIDTH,
        help="number of characters per output line, 0 for no wrap",
    )
    parser.add_argument(
        "-i",
        "--google-list-indent",
        dest="list_indent",
        type=int,
        default=config.GOOGLE_LIST_INDENT,
        help="number of pixels Google indents nested lists",
    )
    parser.add_argument(
        "-s",
        "--hide-strikethrough",
        action="store_true",
        dest="hide_strikethrough",
        default=False,
        help="hide strike-through text. only relevant when -g is specified as well",
    )
    parser.add_argument(
        "--escape-all",
        action="store_true",
        dest="escape_snob",
        default=False,
        help=(
            "Escape all special characters. Output is less readable, but avoids "
            "corner case formatting issues."
        ),
    )
    parser.add_argument(
        "--bypass-tables",
        action="store_true",
        dest="bypass_tables",
        default=config.BYPASS_TABLES,
        help="Format tables in HTML rather than Markdown syntax.",
    )
    parser.add_argument(
        "--ignore-tables",
        action="store_true",
        dest="ignore_tables",
        default=config.IGNORE_TABLES,
        help="Ignore table-related tags (table, th, td, tr) while keeping rows.",
    )
    parser.add_argument(
        "--single-line-break",
        action="store_true",
        dest="single_line_break",
        default=config.SINGLE_LINE_BREAK,
        help=(
            "Use a single line break after a block element rather than two line "
            "breaks. NOTE: Requires --body-width=0"
        ),
    )
    parser.add_argument(
        "--unicode-snob",
        action="store_true",
        dest="unicode_snob",
        default=config.UNICODE_SNOB,
        help="Use unicode throughout document",
    )
    parser.add_argument(
        "--no-automatic-links",
        action="store_false",
        dest="use_automatic_links",
        default=config.USE_AUTOMATIC_LINKS,
        help="Do not use automatic links wherever applicable",
    )
    parser.add_argument(
        "--no-skip-internal-links",
        action="store_false",
        dest="skip_internal_links",
        default=config.SKIP_INTERNAL_LINKS,
        help="Do not skip internal links",
    )
    parser.add_argument(
        "--links-after-para",
        action="store_true",
        dest="links_each_paragraph",
        default=config.LINKS_EACH_PARAGRAPH,
        help="Put links after each paragraph instead of document",
    )
    parser.add_argument(
        "--mark-code",
        action="store_true",
        dest="mark_code",
        default=config.MARK_CODE,
        help="Mark program code blocks with [code]...[/code]",
    )
    parser.add_argument(
        "--decode-errors",
        dest="decode_errors",
        default=config.DECODE_ERRORS,
        help=(
            "What to do in case of decode errors.'ignore', 'strict' and 'replace' are "
            "acceptable values"
        ),
    )
    parser.add_argument(
        "--open-quote",
        dest="open_quote",
        default=config.OPEN_QUOTE,
        help="The character used to open quotes",
    )
    parser.add_argument(
        "--close-quote",
        dest="close_quote",
        default=config.CLOSE_QUOTE,
        help="The character used to close quotes",
    )
    parser.add_argument(
        "--version", action="version", version=".".join(map(str, __version__))
    )
    # Both positionals are optional; a missing filename means "read stdin".
    parser.add_argument("filename", nargs="?")
    parser.add_argument("encoding", nargs="?", default="utf-8")
    args = parser.parse_args()

    # Read raw bytes in both cases so decoding happens in exactly one place.
    if args.filename and args.filename != "-":
        with open(args.filename, "rb") as fp:
            data = fp.read()
    else:
        data = sys.stdin.buffer.read()

    try:
        html = data.decode(args.encoding, args.decode_errors)
    except UnicodeDecodeError:
        warning = (
            bcolors.WARNING
            + "Warning:"
            + bcolors.ENDC
            + " Use the "
            + bcolors.OKGREEN
            + "--decode-errors=ignore"
            + bcolors.ENDC
            + " flag."
        )
        # stdout carries the converted document, so the hint goes to stderr
        # to keep pipes and redirects free of diagnostic text.
        print(warning, file=sys.stderr)
        raise

    converter = HTML2Text(baseurl=baseurl)

    # Style switches that only override the converter's marks when requested.
    if args.ul_style_dash:
        converter.ul_item_mark = "-"
    if args.em_style_asterisk:
        # Asterisk emphasis also switches the strong mark to "__".
        converter.emphasis_mark = "*"
        converter.strong_mark = "__"

    # Remaining options are copied unconditionally (the argparse defaults
    # above supply the value when a flag is absent).
    converter.body_width = args.body_width
    converter.google_list_indent = args.list_indent
    converter.ignore_emphasis = args.ignore_emphasis
    converter.ignore_links = args.ignore_links
    converter.protect_links = args.protect_links
    converter.ignore_images = args.ignore_images
    converter.images_as_html = args.images_as_html
    converter.images_to_alt = args.images_to_alt
    converter.images_with_size = args.images_with_size
    converter.google_doc = args.google_doc
    converter.hide_strikethrough = args.hide_strikethrough
    converter.escape_snob = args.escape_snob
    converter.bypass_tables = args.bypass_tables
    converter.ignore_tables = args.ignore_tables
    converter.single_line_break = args.single_line_break
    converter.inline_links = args.inline_links
    converter.unicode_snob = args.unicode_snob
    converter.use_automatic_links = args.use_automatic_links
    converter.skip_internal_links = args.skip_internal_links
    converter.links_each_paragraph = args.links_each_paragraph
    converter.mark_code = args.mark_code
    converter.wrap_links = args.wrap_links
    converter.wrap_list_items = args.wrap_list_items
    converter.pad_tables = args.pad_tables
    converter.default_image_alt = args.default_image_alt
    converter.open_quote = args.open_quote
    converter.close_quote = args.close_quote

    sys.stdout.write(converter.handle(html))
|