Skip to content

Commit 5b03cf2

Browse files
committed
textlib: refactor HTML removal logic using GetDataHTML parser
Refactored `removeHTMLParts()` to use the `GetDataHTML` parser class. Added support for removing HTML tag content and preserving tag attributes. Preserved backward compatibility and added examples as doctests. - Introduced `removetags` parameter to remove specified tag blocks - Preserved tag attributes for kept tags - Replaced internal logic with a cleaner, reusable HTMLParser subclass - Added comprehensive docstrings and usage examples Bug: T399378 Change-Id: I4c1d99f4d41b74dd080f3b631c8f184f56a6d637
1 parent b66f67e commit 5b03cf2

File tree

1 file changed

+173
-46
lines changed

1 file changed

+173
-46
lines changed

pywikibot/textlib.py

Lines changed: 173 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from collections import OrderedDict
1212
from collections.abc import Sequence
1313
from contextlib import closing, suppress
14+
from dataclasses import dataclass
1415
from html.parser import HTMLParser
1516
from typing import NamedTuple
1617

@@ -539,82 +540,208 @@ def removeDisabledParts(text: str,
539540
return text
540541

541542

542-
def removeHTMLParts(text: str,
                    keeptags: list[str] | None = None,
                    *,
                    removetags: list[str] | None = None) -> str:
    """Strip HTML markup, comments and selected tag contents from text.

    All HTML tags and comments are dropped from *text*. Tags named in
    *keeptags* stay in the result together with their attributes, and
    the content of tags named in *removetags* is dropped as well. The
    work is delegated to the :class:`GetDataHTML` parser class, which is
    created with the given tag lists and then called with *text*.

    **Example:**

    >>> remove = removeHTMLParts
    >>> remove('<div><b><ref><tt>Hi all!</tt></ref></b></div>')
    '<tt>Hi all!</tt>'
    >>> remove('<style><b>This is stylish</b></style>', keeptags=['style'])
    '<style></style>'
    >>> remove('<a>Note:</a> <b>This is important!<!-- really? --></b>')
    'Note: This is important!'
    >>> remove('<a>Note:</a> <b>This is important!</b>', removetags=['a'])
    ' This is important!'

    .. caution:: Tag names must be given in lowercase.

    .. versionchanged:: 10.3
       The *removetags* parameter was added. Refactored to use
       :class:`GetDataHTML` and its ``__call__`` method. Tag attributes
       will be kept.

    :param text: The input HTML text to clean.
    :param keeptags: List of tag names to keep, including their content
        and markup. Defaults to :code:`['tt', 'nowiki', 'small', 'sup']`
        if None.
    :param removetags: List of tag names whose tags and content should
        be removed. The tags can be preserved if listed in *keeptags*.
        Defaults to :code:`['style', 'script']` if None.
    :return: The cleaned text with specified HTML parts removed.
    """
    # None is passed through unchanged; the parser substitutes its own
    # default tag lists.
    cleaner = GetDataHTML(keeptags=keeptags, removetags=removetags)
    return cleaner(text)
583+
584+
585+
@dataclass(init=False, eq=False)
586+
class GetDataHTML(HTMLParser):
587+
588+
"""HTML parser that removes unwanted HTML elements and optionally comments.
589+
590+
Tags listed in *keeptags* are preserved. Tags listed in *removetags*
591+
are removed entirely along with their content. Optionally strips HTML
592+
comments. Use via the callable interface or in a :code:`with closing(...)`
593+
block.
594+
595+
.. note::
596+
The callable interface is preferred because it is simpler and
597+
ensures proper resource management automatically. If using the
598+
context manager, be sure to access :attr:`textdata` before calling
599+
:meth:`close`.
600+
601+
.. tabs::
566602
603+
.. tab:: callable interface
567604
568-
class _GetDataHTML(HTMLParser):
605+
.. code-block:: python
569606
570-
"""HTML parser which removes html tags except they are listed in keeptags.
607+
text = ('<html><head><title>Test</title></head>'
608+
'<body><h1><!-- Parse --> me!</h1></body></html>')
571609
572-
The parser is used by :func:`removeHTMLParts` similar to this:
610+
parser = GetDataHTML(keeptags = ['html'])
611+
clean_text = parser(text)
573612
574-
.. code-block:: python
613+
.. tab:: closing block
575614
576-
from contextlib import closing
577-
from pywikibot.textlib import _GetDataHTML
578-
with closing(_GetDataHTML()) as parser:
579-
parser.keeptags = ['html']
580-
parser.feed('<html><head><title>Test</title></head>'
581-
'<body><h1><!-- Parse --> me!</h1></body></html>')
582-
print(parser.textdata)
615+
.. code-block:: python
583616
584-
The result is:
617+
from contextlib import closing
618+
text = ('<html><head><title>Test</title></head>'
619+
'<body><h1><!-- Parse --> me!</h1></body></html>')
585620
586-
.. code-block:: html
621+
parser = GetDataHTML(keeptags = ['html'])
622+
with closing(parser):
623+
parser.feed(text)
624+
clean_text = parser.textdata
587625
588-
<html>Test me!</html>
626+
.. warning:: Save the :attr:`textdata` **before** :meth:`close`
627+
is called; otherwise the cleaned text is empty.
628+
629+
**Usage:**
630+
631+
>>> text = ('<html><head><title>Test</title></head>'
632+
... '<body><h1><!-- Parse --> me!</h1></body></html>')
633+
>>> GetDataHTML()(text)
634+
'Test me!'
635+
>>> GetDataHTML(keeptags=['title'])(text)
636+
'<title>Test</title> me!'
637+
>>> GetDataHTML(removetags=['body'])(text)
638+
'Test'
639+
640+
.. caution:: Tag names must be given in lowercase.
589641
590642
.. versionchanged:: 9.2
591-
This class is no longer a context manager;
592-
:pylib:`contextlib.closing()<contextlib#contextlib.closing>`
593-
should be used instead.
643+
No longer a context manager
644+
645+
.. versionchanged:: 10.3
646+
Public class now. Added support for removals of tag contents.
594647
595648
.. seealso::
649+
- :func:`removeHTMLParts`
596650
- :pylib:`html.parser`
597-
- :pylib:`contextlib#contextlib.closing`
598651
599-
:meta public:
652+
:param keeptags: List of tag names to keep, including their content
653+
and markup. Defaults to :code:`['tt', 'nowiki', 'small', 'sup']`
654+
if None.
655+
:param removetags: List of tag names whose tags and content should
656+
be removed. The tags can be preserved if listed in *keeptags*.
657+
Defaults to :code:`['style', 'script']` if None.
658+
:param removecomments: Whether to remove HTML comments. Defaults to
659+
True.
600660
"""
601661

602-
textdata = ''
603-
keeptags: list[str] = []
662+
def __init__(self, *,
663+
keeptags: list[str] | None = None,
664+
removetags: list[str] | None = None) -> None:
665+
"""Initialize default tags and internal state."""
666+
super().__init__()
667+
self.keeptags: list[str] = (keeptags if keeptags is not None
668+
else ['tt', 'nowiki', 'small', 'sup'])
669+
self.removetags: list[str] = (removetags if removetags is not None
670+
else ['style', 'script'])
671+
672+
#: The cleaned output text collected during parsing.
673+
self.textdata = ''
674+
675+
self._skiptag: str | None = None
604676

605-
def handle_data(self, data) -> None:
606-
"""Add data to text."""
607-
self.textdata += data
677+
def __call__(self, text: str) -> str:
678+
"""Feed the parser with *text* and return cleaned :attr:`textdata`.
608679
609-
def handle_starttag(self, tag, attrs) -> None:
610-
"""Add start tag to text if tag should be kept."""
680+
:param text: The HTML text to parse and clean.
681+
:return: The cleaned text with unwanted tags/content removed.
682+
"""
683+
with closing(self):
684+
self.feed(text)
685+
return self.textdata
686+
687+
def close(self) -> None:
688+
"""Clean current processing and clear :attr:`textdata`."""
689+
self.textdata = ''
690+
self._skiptag = None
691+
super().close()
692+
693+
def handle_data(self, data: str) -> None:
694+
"""Handle plain text content found between tags.
695+
696+
Text is added to the output unless it is located inside a tag
697+
marked for removal.
698+
699+
:param data: The text data between HTML tags.
700+
"""
701+
if not self._skiptag:
702+
self.textdata += data
703+
704+
def handle_starttag(self,
705+
tag: str,
706+
attrs: list[tuple[str, str | None]]) -> None:
707+
"""Handle an opening HTML tag.
708+
709+
Tags listed in *keeptags* are preserved in the output. Tags
710+
listed in *removetags* begin a skip block, and their content
711+
will be excluded from the output.
712+
713+
.. versionchanged:: 10.3
714+
Keep tag attributes.
715+
716+
:param tag: The tag name (e.g., "div", "script") converted to
717+
lowercase.
718+
:param attrs: A list of (name, value) pairs with tag attributes.
719+
"""
611720
if tag in self.keeptags:
612-
self.textdata += f'<{tag}>'
613721

614-
def handle_endtag(self, tag) -> None:
615-
"""Add end tag to text if tag should be kept."""
722+
# Reconstruct attributes for preserved tags
723+
attr_text = ''.join(
724+
f' {name}' if value is None else f' {name}="{value}"'
725+
for name, value in attrs
726+
)
727+
self.textdata += f'<{tag}{attr_text}>'
728+
729+
if tag in self.removetags:
730+
self._skiptag = tag
731+
732+
def handle_endtag(self, tag: str) -> None:
733+
"""Handle a closing HTML tag.
734+
735+
Tags listed in *keeptags* are preserved in the output. A closing
736+
tag that matches the currently skipped tag will end the skip
737+
block.
738+
739+
:param tag: The name of the closing tag.
740+
"""
616741
if tag in self.keeptags:
617742
self.textdata += f'</{tag}>'
743+
if tag in self.removetags and tag == self._skiptag:
744+
self._skiptag = None
618745

619746

620747
def isDisabled(text: str, index: int, tags=None) -> bool:

0 commit comments

Comments
 (0)