"""Command-line front end for this package's HTML2Text converter."""

import argparse
import sys

from . import HTML2Text, __version__, config


def main() -> None:
    """Run the command-line interface.

    Parses the command-line options, reads HTML bytes from the given file
    (or from standard input when no filename is given or it is ``-``),
    decodes them with the requested encoding, converts the text with
    ``HTML2Text`` and writes the result to standard output.

    Raises:
        UnicodeDecodeError: if the input cannot be decoded with the selected
            encoding and error handler. A hint suggesting
            ``--decode-errors=ignore`` is written to standard error first.
    """
    baseurl = ""

    # ANSI escape sequences used to colour the decode-error hint.
    class bcolors:
        HEADER = "\033[95m"
        OKBLUE = "\033[94m"
        OKGREEN = "\033[92m"
        WARNING = "\033[93m"
        FAIL = "\033[91m"
        ENDC = "\033[0m"
        BOLD = "\033[1m"
        UNDERLINE = "\033[4m"

    p = argparse.ArgumentParser()
    p.add_argument(
        "--default-image-alt",
        dest="default_image_alt",
        default=config.DEFAULT_IMAGE_ALT,
        help="The default alt string for images with missing ones",
    )
    p.add_argument(
        "--pad-tables",
        dest="pad_tables",
        action="store_true",
        default=config.PAD_TABLES,
        help="pad the cells to equal column width in tables",
    )
    p.add_argument(
        "--no-wrap-links",
        dest="wrap_links",
        action="store_false",
        default=config.WRAP_LINKS,
        help="don't wrap links during conversion",
    )
    p.add_argument(
        "--wrap-list-items",
        dest="wrap_list_items",
        action="store_true",
        default=config.WRAP_LIST_ITEMS,
        help="wrap list items during conversion",
    )
    p.add_argument(
        "--ignore-emphasis",
        dest="ignore_emphasis",
        action="store_true",
        default=config.IGNORE_EMPHASIS,
        help="don't include any formatting for emphasis",
    )
    p.add_argument(
        "--reference-links",
        dest="inline_links",
        action="store_false",
        default=config.INLINE_LINKS,
        help="use reference style links instead of inline links",
    )
    p.add_argument(
        "--ignore-links",
        dest="ignore_links",
        action="store_true",
        default=config.IGNORE_ANCHORS,
        help="don't include any formatting for links",
    )
    p.add_argument(
        "--protect-links",
        dest="protect_links",
        action="store_true",
        default=config.PROTECT_LINKS,
        help="protect links from line breaks surrounding them with angle brackets",
    )
    p.add_argument(
        "--ignore-images",
        dest="ignore_images",
        action="store_true",
        default=config.IGNORE_IMAGES,
        help="don't include any formatting for images",
    )
    p.add_argument(
        "--images-as-html",
        dest="images_as_html",
        action="store_true",
        default=config.IMAGES_AS_HTML,
        help=(
            "Always write image tags as raw html; preserves `height`, `width` and "
            "`alt` if possible."
        ),
    )
    p.add_argument(
        "--images-to-alt",
        dest="images_to_alt",
        action="store_true",
        default=config.IMAGES_TO_ALT,
        help="Discard image data, only keep alt text",
    )
    p.add_argument(
        "--images-with-size",
        dest="images_with_size",
        action="store_true",
        default=config.IMAGES_WITH_SIZE,
        help=(
            "Write image tags with height and width attrs as raw html to retain "
            "dimensions"
        ),
    )
    p.add_argument(
        "-g",
        "--google-doc",
        action="store_true",
        dest="google_doc",
        default=False,
        help="convert an html-exported Google Document",
    )
    p.add_argument(
        "-d",
        "--dash-unordered-list",
        action="store_true",
        dest="ul_style_dash",
        default=False,
        help="use a dash rather than a star for unordered list items",
    )
    p.add_argument(
        "-e",
        "--asterisk-emphasis",
        action="store_true",
        dest="em_style_asterisk",
        default=False,
        help="use an asterisk rather than an underscore for emphasized text",
    )
    p.add_argument(
        "-b",
        "--body-width",
        dest="body_width",
        type=int,
        default=config.BODY_WIDTH,
        help="number of characters per output line, 0 for no wrap",
    )
    p.add_argument(
        "-i",
        "--google-list-indent",
        dest="list_indent",
        type=int,
        default=config.GOOGLE_LIST_INDENT,
        help="number of pixels Google indents nested lists",
    )
    p.add_argument(
        "-s",
        "--hide-strikethrough",
        action="store_true",
        dest="hide_strikethrough",
        default=False,
        help="hide strike-through text. only relevant when -g is "
        "specified as well",
    )
    p.add_argument(
        "--escape-all",
        action="store_true",
        dest="escape_snob",
        default=False,
        help=(
            "Escape all special characters.  Output is less readable, but avoids "
            "corner case formatting issues."
        ),
    )
    p.add_argument(
        "--bypass-tables",
        action="store_true",
        dest="bypass_tables",
        default=config.BYPASS_TABLES,
        help="Format tables in HTML rather than Markdown syntax.",
    )
    p.add_argument(
        "--ignore-tables",
        action="store_true",
        dest="ignore_tables",
        default=config.IGNORE_TABLES,
        help="Ignore table-related tags (table, th, td, tr) " "while keeping rows.",
    )
    p.add_argument(
        "--single-line-break",
        action="store_true",
        dest="single_line_break",
        default=config.SINGLE_LINE_BREAK,
        help=(
            "Use a single line break after a block element rather than two line "
            "breaks. NOTE: Requires --body-width=0"
        ),
    )
    p.add_argument(
        "--unicode-snob",
        action="store_true",
        dest="unicode_snob",
        default=config.UNICODE_SNOB,
        help="Use unicode throughout document",
    )
    p.add_argument(
        "--no-automatic-links",
        action="store_false",
        dest="use_automatic_links",
        default=config.USE_AUTOMATIC_LINKS,
        help="Do not use automatic links wherever applicable",
    )
    p.add_argument(
        "--no-skip-internal-links",
        action="store_false",
        dest="skip_internal_links",
        default=config.SKIP_INTERNAL_LINKS,
        help="Do not skip internal links",
    )
    p.add_argument(
        "--links-after-para",
        action="store_true",
        dest="links_each_paragraph",
        default=config.LINKS_EACH_PARAGRAPH,
        help="Put links after each paragraph instead of document",
    )
    p.add_argument(
        "--mark-code",
        action="store_true",
        dest="mark_code",
        default=config.MARK_CODE,
        help="Mark program code blocks with [code]...[/code]",
    )
    p.add_argument(
        "--decode-errors",
        dest="decode_errors",
        default=config.DECODE_ERRORS,
        help=(
            "What to do in case of decode errors.'ignore', 'strict' and 'replace' are "
            "acceptable values"
        ),
    )
    p.add_argument(
        "--open-quote",
        dest="open_quote",
        default=config.OPEN_QUOTE,
        help="The character used to open quotes",
    )
    p.add_argument(
        "--close-quote",
        dest="close_quote",
        default=config.CLOSE_QUOTE,
        help="The character used to close quotes",
    )
    p.add_argument(
        "--version", action="version", version=".".join(map(str, __version__))
    )
    p.add_argument("filename", nargs="?")
    p.add_argument("encoding", nargs="?", default="utf-8")
    args = p.parse_args()

    # Read raw bytes in both cases so that decoding is governed solely by the
    # ``encoding`` positional and ``--decode-errors``.
    if args.filename and args.filename != "-":
        with open(args.filename, "rb") as fp:
            data = fp.read()
    else:
        data = sys.stdin.buffer.read()

    try:
        html = data.decode(args.encoding, args.decode_errors)
    except UnicodeDecodeError:
        warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
        warning += " Use the " + bcolors.OKGREEN
        warning += "--decode-errors=ignore" + bcolors.ENDC + " flag."
        # Standard output carries the converted document, so the hint goes to
        # standard error to keep it out of piped or redirected output.
        print(warning, file=sys.stderr)
        raise

    h = HTML2Text(baseurl=baseurl)

    # Options that change a marker only when their flag is given.
    if args.ul_style_dash:
        h.ul_item_mark = "-"
    if args.em_style_asterisk:
        h.emphasis_mark = "*"
        # NOTE(review): presumably keeps the strong marker distinct from the
        # emphasis marker -- confirm against the HTML2Text defaults.
        h.strong_mark = "__"

    # Remaining options are copied onto the converter unconditionally.
    h.body_width = args.body_width
    h.google_list_indent = args.list_indent
    h.ignore_emphasis = args.ignore_emphasis
    h.ignore_links = args.ignore_links
    h.protect_links = args.protect_links
    h.ignore_images = args.ignore_images
    h.images_as_html = args.images_as_html
    h.images_to_alt = args.images_to_alt
    h.images_with_size = args.images_with_size
    h.google_doc = args.google_doc
    h.hide_strikethrough = args.hide_strikethrough
    h.escape_snob = args.escape_snob
    h.bypass_tables = args.bypass_tables
    h.ignore_tables = args.ignore_tables
    h.single_line_break = args.single_line_break
    h.inline_links = args.inline_links
    h.unicode_snob = args.unicode_snob
    h.use_automatic_links = args.use_automatic_links
    h.skip_internal_links = args.skip_internal_links
    h.links_each_paragraph = args.links_each_paragraph
    h.mark_code = args.mark_code
    h.wrap_links = args.wrap_links
    h.wrap_list_items = args.wrap_list_items
    h.pad_tables = args.pad_tables
    h.default_image_alt = args.default_image_alt
    h.open_quote = args.open_quote
    h.close_quote = args.close_quote

    sys.stdout.write(h.handle(html))