diff --git a/docs/how_it_works.md b/docs/how_it_works.md index 3602610..9416a70 100644 --- a/docs/how_it_works.md +++ b/docs/how_it_works.md @@ -24,7 +24,7 @@ Used to provide various configuration settings to the converter. They are as fol - INLINE_LINKS for formatting images and links - PROTECT_LINKS protect from line breaks - GOOGLE_LIST_INDENT no of pixels to indent nested lists - - IGNORE_ANCHORS + - IGNORE_LINKS - IGNORE_IMAGES - IMAGES_AS_HTML always generate HTML tags for images; preserves `height`, `width`, `alt` if possible. - IMAGES_TO_ALT diff --git a/docs/usage.md b/docs/usage.md index a1758d3..9e31989 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -70,7 +70,7 @@ simple indications of their function. - INLINE_LINKS for formatting images and links - PROTECT_LINKS protect from line breaks - GOOGLE_LIST_INDENT no of pixels to indent nested lists - - IGNORE_ANCHORS + - IGNORE_LINKS - IGNORE_IMAGES - IMAGES_AS_HTML always generate HTML tags for images; preserves `height`, `width`, `alt` if possible. - IMAGES_TO_ALT diff --git a/html2text/__init__.py b/html2text/__init__.py index d8e41a1..add0725 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -34,11 +34,39 @@ class HTML2Text(html.parser.HTMLParser): + init_params = [ + "bypass_tables", + "close_quote", + "default_image_alt", + "escape_snob", + "google_list_indent", + "ignore_emphasis", + "ignore_images", + "ignore_links", + "ignore_tables", + "images_as_html", + "images_to_alt", + "images_with_size", + "inline_links", + "links_each_paragraph", + "mark_code", + "open_quote", + "pad_tables", + "protect_links", + "single_line_break", + "skip_internal_links", + "unicode_snob", + "use_automatic_links", + "wrap_links", + "wrap_list_items", + ] + def __init__( self, out: Optional[OutCallback] = None, baseurl: str = "", bodywidth: int = config.BODY_WIDTH, + **kwargs ) -> None: """ Input parameters: @@ -52,37 +80,16 @@ def __init__( self.split_next_td = False self.td_count = 0 self.table_start = False - self.unicode_snob = config.UNICODE_SNOB # covered in cli - self.escape_snob = config.ESCAPE_SNOB # covered in cli - self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH - self.body_width = bodywidth # covered in cli - self.skip_internal_links = config.SKIP_INTERNAL_LINKS # covered in cli - self.inline_links = config.INLINE_LINKS # covered in cli - self.protect_links = config.PROTECT_LINKS # covered in cli - self.google_list_indent = config.GOOGLE_LIST_INDENT # covered in cli - self.ignore_links = config.IGNORE_ANCHORS # covered in cli - self.ignore_images = config.IGNORE_IMAGES # covered in cli - self.images_as_html = config.IMAGES_AS_HTML # covered in cli - self.images_to_alt = config.IMAGES_TO_ALT # covered in cli - self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli - self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli - self.bypass_tables = config.BYPASS_TABLES # covered in cli - self.ignore_tables = config.IGNORE_TABLES # covered in cli - self.google_doc = False # covered in cli - self.ul_item_mark = "*" # covered in cli - self.emphasis_mark = "_" # covered in cli + self.google_doc = False + self.ul_item_mark = "*" + self.emphasis_mark = "_" self.strong_mark = "**" - self.single_line_break = config.SINGLE_LINE_BREAK # covered in cli - self.use_automatic_links = config.USE_AUTOMATIC_LINKS # covered in cli - self.hide_strikethrough = False # covered in cli - self.mark_code = config.MARK_CODE - self.wrap_list_items = config.WRAP_LIST_ITEMS # covered in cli - self.wrap_links = config.WRAP_LINKS # covered in cli - self.pad_tables = config.PAD_TABLES # covered in cli - self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli + self.hide_strikethrough = False self.tag_callback = None - self.open_quote = config.OPEN_QUOTE # covered in cli - self.close_quote = config.CLOSE_QUOTE # covered in cli + self.body_width = bodywidth + + for param in self.init_params: + setattr(self, param, kwargs.get(param, getattr(config, param.upper()))) if out is None: self.out = self.outtextf @@ -939,9 +946,14 @@ def optwrap(self, text: str) -> str: return result -def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str: +def html2text( + html: str, + baseurl: str = "", + bodywidth: Optional[int] = None, + **kwargs: Optional[OutCallback] +) -> str: if bodywidth is None: bodywidth = config.BODY_WIDTH - h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth) + h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth, **kwargs) return h.handle(html) diff --git a/html2text/cli.py b/html2text/cli.py index 30a362e..586ad96 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -63,7 +63,7 @@ class bcolors: "--ignore-links", dest="ignore_links", action="store_true", - default=config.IGNORE_ANCHORS, + default=config.IGNORE_LINKS, help="don't include any formatting for links", ) p.add_argument( diff --git a/html2text/config.py b/html2text/config.py index 2bb38b6..c01525d 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -37,7 +37,7 @@ # Values Google and others may use to indicate bold text BOLD_TEXT_STYLE_VALUES = ("bold", "700", "800", "900") -IGNORE_ANCHORS = False +IGNORE_LINKS = False IGNORE_IMAGES = False IMAGES_AS_HTML = False IMAGES_TO_ALT = False diff --git a/test/test_html2text.py b/test/test_html2text.py index 7bdd679..533852c 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -226,3 +226,15 @@ def _skip_certain_tags(h2t, tag, attrs, start): "some italics too." ) assert ret == ("this is a txt and this is a with text and some _italics_ too.\n\n") + + +def test_kwargs_in_class(): + h = html2text.HTML2Text(wrap_links=False) + assert h.wrap_links is False + + +def test_kwargs_in_function(): + test_data = "Foo" + wrapped = html2text.html2text(test_data, wrap_links=True) + unwrapped = html2text.html2text(test_data, wrap_links=False) + assert wrapped != unwrapped