Skip to content

Commit a6d9f9a

Browse files
author
Tomasz Grining
committed
feature/add-optional-kwargs: Added optional kwargs
Most of the init arguments of the ``HTML2Text`` class are hardcoded from constants and can be changed only through the CLI, not when the package is used as a library. This adds the ability to pass them as kwargs, either through the ``html2text`` function call or through the class init. Please note that the commit contains syntax that ``mypy`` does not recognize, although it is correct. Note: python/mypy#5719
1 parent 2d2c702 commit a6d9f9a

File tree

6 files changed

+59
-35
lines changed

6 files changed

+59
-35
lines changed

docs/how_it_works.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ Used to provide various configuration settings to the converter. They are as fol
2424
- INLINE_LINKS for formatting images and links
2525
- PROTECT_LINKS protect from line breaks
2626
- GOOGLE_LIST_INDENT no of pixels to indent nested lists
27-
- IGNORE_ANCHORS
27+
- IGNORE_LINKS
2828
- IGNORE_IMAGES
2929
- IMAGES_AS_HTML always generate HTML tags for images; preserves `height`, `width`, `alt` if possible.
3030
- IMAGES_TO_ALT

docs/usage.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ simple indications of their function.
7070
- INLINE_LINKS for formatting images and links
7171
- PROTECT_LINKS protect from line breaks
7272
- GOOGLE_LIST_INDENT no of pixels to indent nested lists
73-
- IGNORE_ANCHORS
73+
- IGNORE_LINKS
7474
- IGNORE_IMAGES
7575
- IMAGES_AS_HTML always generate HTML tags for images; preserves `height`, `width`, `alt` if possible.
7676
- IMAGES_TO_ALT

html2text/__init__.py

+43-31
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,39 @@
3434

3535

3636
class HTML2Text(html.parser.HTMLParser):
37+
init_params = [
38+
"bypass_tables",
39+
"close_quote",
40+
"default_image_alt",
41+
"escape_snob",
42+
"google_list_indent",
43+
"ignore_emphasis",
44+
"ignore_images",
45+
"ignore_links",
46+
"ignore_tables",
47+
"images_as_html",
48+
"images_to_alt",
49+
"images_with_size",
50+
"inline_links",
51+
"links_each_paragraph",
52+
"mark_code",
53+
"open_quote",
54+
"pad_tables",
55+
"protect_links",
56+
"single_line_break",
57+
"skip_internal_links",
58+
"unicode_snob",
59+
"use_automatic_links",
60+
"wrap_links",
61+
"wrap_list_items",
62+
]
63+
3764
def __init__(
3865
self,
3966
out: Optional[OutCallback] = None,
4067
baseurl: str = "",
4168
bodywidth: int = config.BODY_WIDTH,
69+
**kwargs
4270
) -> None:
4371
"""
4472
Input parameters:
@@ -52,37 +80,16 @@ def __init__(
5280
self.split_next_td = False
5381
self.td_count = 0
5482
self.table_start = False
55-
self.unicode_snob = config.UNICODE_SNOB # covered in cli
56-
self.escape_snob = config.ESCAPE_SNOB # covered in cli
57-
self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
58-
self.body_width = bodywidth # covered in cli
59-
self.skip_internal_links = config.SKIP_INTERNAL_LINKS # covered in cli
60-
self.inline_links = config.INLINE_LINKS # covered in cli
61-
self.protect_links = config.PROTECT_LINKS # covered in cli
62-
self.google_list_indent = config.GOOGLE_LIST_INDENT # covered in cli
63-
self.ignore_links = config.IGNORE_ANCHORS # covered in cli
64-
self.ignore_images = config.IGNORE_IMAGES # covered in cli
65-
self.images_as_html = config.IMAGES_AS_HTML # covered in cli
66-
self.images_to_alt = config.IMAGES_TO_ALT # covered in cli
67-
self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli
68-
self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli
69-
self.bypass_tables = config.BYPASS_TABLES # covered in cli
70-
self.ignore_tables = config.IGNORE_TABLES # covered in cli
71-
self.google_doc = False # covered in cli
72-
self.ul_item_mark = "*" # covered in cli
73-
self.emphasis_mark = "_" # covered in cli
83+
self.google_doc = False
84+
self.ul_item_mark = "*"
85+
self.emphasis_mark = "_"
7486
self.strong_mark = "**"
75-
self.single_line_break = config.SINGLE_LINE_BREAK # covered in cli
76-
self.use_automatic_links = config.USE_AUTOMATIC_LINKS # covered in cli
77-
self.hide_strikethrough = False # covered in cli
78-
self.mark_code = config.MARK_CODE
79-
self.wrap_list_items = config.WRAP_LIST_ITEMS # covered in cli
80-
self.wrap_links = config.WRAP_LINKS # covered in cli
81-
self.pad_tables = config.PAD_TABLES # covered in cli
82-
self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli
87+
self.hide_strikethrough = False
8388
self.tag_callback = None
84-
self.open_quote = config.OPEN_QUOTE # covered in cli
85-
self.close_quote = config.CLOSE_QUOTE # covered in cli
89+
self.body_width = bodywidth
90+
91+
for param in self.init_params:
92+
setattr(self, param, kwargs.get(param, getattr(config, param.upper())))
8693

8794
if out is None:
8895
self.out = self.outtextf
@@ -939,9 +946,14 @@ def optwrap(self, text: str) -> str:
939946
return result
940947

941948

942-
def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str:
949+
def html2text(
950+
html: str,
951+
baseurl: str = "",
952+
bodywidth: Optional[int] = None,
953+
**kwargs: Optional[OutCallback]
954+
) -> str:
943955
if bodywidth is None:
944956
bodywidth = config.BODY_WIDTH
945-
h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth)
957+
h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth, **kwargs)
946958

947959
return h.handle(html)

html2text/cli.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ class bcolors:
6363
"--ignore-links",
6464
dest="ignore_links",
6565
action="store_true",
66-
default=config.IGNORE_ANCHORS,
66+
default=config.IGNORE_LINKS,
6767
help="don't include any formatting for links",
6868
)
6969
p.add_argument(

html2text/config.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
# Values Google and others may use to indicate bold text
3838
BOLD_TEXT_STYLE_VALUES = ("bold", "700", "800", "900")
3939

40-
IGNORE_ANCHORS = False
40+
IGNORE_LINKS = False
4141
IGNORE_IMAGES = False
4242
IMAGES_AS_HTML = False
4343
IMAGES_TO_ALT = False

test/test_html2text.py

+12
Original file line numberDiff line numberDiff line change
@@ -226,3 +226,15 @@ def _skip_certain_tags(h2t, tag, attrs, start):
226226
"some <i>italics</i> too."
227227
)
228228
assert ret == ("this is a txt and this is a with text and some _italics_ too.\n\n")
229+
230+
231+
def test_kwargs_in_class():
232+
h = html2text.HTML2Text(wrap_links=False)
233+
assert h.wrap_links is False
234+
235+
236+
def test_kwargs_in_function():
237+
test_data = "<a href='http://foo.com/" + "foo-bar/" * 10 + "'>Foo</a>"
238+
wrapped = html2text.html2text(test_data, wrap_links=True)
239+
unwrapped = html2text.html2text(test_data, wrap_links=False)
240+
assert wrapped != unwrapped

0 commit comments

Comments
 (0)