|
11 | 11 | from collections import OrderedDict |
12 | 12 | from collections.abc import Sequence |
13 | 13 | from contextlib import closing, suppress |
| 14 | +from dataclasses import dataclass |
14 | 15 | from html.parser import HTMLParser |
15 | 16 | from typing import NamedTuple |
16 | 17 |
|
@@ -539,82 +540,208 @@ def removeDisabledParts(text: str, |
539 | 540 | return text |
540 | 541 |
|
541 | 542 |
|
542 | | -def removeHTMLParts(text: str, keeptags: list[str] | None = None) -> str: |
543 | | - """Return text without portions where HTML markup is disabled. |
| 543 | +def removeHTMLParts(text: str, |
| 544 | + keeptags: list[str] | None = None, |
| 545 | + *, |
| 546 | + removetags: list[str] | None = None) -> str: |
| 547 | + """Remove selected HTML tags, their content, and comments from text. |
544 | 548 |
|
545 | | - Parts that can/will be removed are HTML tags and all wiki tags. The |
546 | | - exact set of parts which should NOT be removed can be passed as the |
547 | | - *keeptags* parameter, which defaults to |
548 | | - ``['tt', 'nowiki', 'small', 'sup']``. |
| 549 | + This function removes HTML tags and their contents for tags listed |
| 550 | + in ``removetags``. Tags specified in ``keeptags`` are preserved |
| 551 | + along with their content and markup. This is a wrapper around the |
| 552 | + :class:`GetDataHTML` parser class. |
549 | 553 |
|
550 | 554 | **Example:** |
551 | 555 |
|
552 | | - >>> removeHTMLParts('<div><b><ref><tt>Hi all!</tt></ref></b></div>') |
| 556 | + >>> remove = removeHTMLParts |
| 557 | + >>> remove('<div><b><ref><tt>Hi all!</tt></ref></b></div>') |
553 | 558 | '<tt>Hi all!</tt>' |
| 559 | + >>> remove('<style><b>This is stylish</b></style>', keeptags=['style']) |
| 560 | + '<style></style>' |
| 561 | + >>> remove('<a>Note:</a> <b>This is important!<!-- really? --></b>') |
| 562 | + 'Note: This is important!' |
| 563 | + >>> remove('<a>Note:</a> <b>This is important!</b>', removetags=['a']) |
| 564 | + ' This is important!' |
554 | 565 |
|
555 | | - .. seealso:: :class:`_GetDataHTML` |
| 566 | + .. caution:: Tag names must be given in lowercase. |
| 567 | +
|
| 568 | + .. versionchanged:: 10.3 |
| 569 | + The *removetags* parameter was added. Refactored to use |
| 570 | + :class:`GetDataHTML` and its ``__call__`` method. Tag attributes |
| 571 | + are now kept. |
| 572 | +
|
| 573 | + :param text: The input HTML text to clean. |
| 574 | + :param keeptags: List of tag names to keep, including their content |
| 575 | + and markup. Defaults to :code:`['tt', 'nowiki', 'small', 'sup']` |
| 576 | + if None. |
| 577 | + :param removetags: List of tag names whose tags and content should |
| 578 | + be removed. The tags can be preserved if listed in *keeptags*. |
| 579 | + Defaults to :code:`['style', 'script']` if None. |
| 580 | + :return: The cleaned text with specified HTML parts removed. |
556 | 581 | """ |
557 | | - # TODO: try to merge with 'removeDisabledParts()' above into one generic |
558 | | - # function |
559 | | - parser = _GetDataHTML() |
560 | | - if keeptags is None: |
561 | | - keeptags = ['tt', 'nowiki', 'small', 'sup'] |
562 | | - with closing(parser): |
563 | | - parser.keeptags = keeptags |
564 | | - parser.feed(text) |
565 | | - return parser.textdata |
| 582 | + return GetDataHTML(keeptags=keeptags, removetags=removetags)(text) |
| 583 | + |
| 584 | + |
| 585 | +@dataclass(init=False, eq=False) |
| 586 | +class GetDataHTML(HTMLParser): |
| 587 | + |
| 588 | + """HTML parser that removes unwanted HTML elements and optionally comments. |
| 589 | +
|
| 590 | + Tags listed in *keeptags* are preserved. Tags listed in *removetags* |
| 591 | + are removed entirely along with their content. Optionally strips HTML |
| 592 | + comments. Use via the callable interface or in a :code:`with closing(...)` |
| 593 | + block. |
| 594 | +
|
| 595 | + .. note:: |
| 596 | + The callable interface is preferred because it is simpler and |
| 597 | + ensures proper resource management automatically. If using the |
| 598 | + context manager, be sure to access :attr:`textdata` before calling |
| 599 | + :meth:`close`. |
| 600 | +
|
| 601 | + .. tabs:: |
566 | 602 |
|
| 603 | + .. tab:: callable interface |
567 | 604 |
|
568 | | -class _GetDataHTML(HTMLParser): |
| 605 | + .. code-block:: python |
569 | 606 |
|
570 | | - """HTML parser which removes html tags except they are listed in keeptags. |
| 607 | + text = ('<html><head><title>Test</title></head>' |
| 608 | + '<body><h1><!-- Parse --> me!</h1></body></html>') |
571 | 609 |
|
572 | | - The parser is used by :func:`removeHTMLParts` similar to this: |
| 610 | + parser = GetDataHTML(keeptags=['html']) |
| 611 | + clean_text = parser(text) |
573 | 612 |
|
574 | | - .. code-block:: python |
| 613 | + .. tab:: closing block |
575 | 614 |
|
576 | | - from contextlib import closing |
577 | | - from pywikibot.textlib import _GetDataHTML |
578 | | - with closing(_GetDataHTML()) as parser: |
579 | | - parser.keeptags = ['html'] |
580 | | - parser.feed('<html><head><title>Test</title></head>' |
581 | | - '<body><h1><!-- Parse --> me!</h1></body></html>') |
582 | | - print(parser.textdata) |
| 615 | + .. code-block:: python |
583 | 616 |
|
584 | | - The result is: |
| 617 | + from contextlib import closing |
| 618 | + text = ('<html><head><title>Test</title></head>' |
| 619 | + '<body><h1><!-- Parse --> me!</h1></body></html>') |
585 | 620 |
|
586 | | - .. code-block:: html |
| 621 | + parser = GetDataHTML(keeptags=['html']) |
| 622 | + with closing(parser): |
| 623 | + parser.feed(text) |
| 624 | + clean_text = parser.textdata |
587 | 625 |
|
588 | | - <html>Test me!</html> |
| 626 | + .. warning:: Save the :attr:`textdata` **before** :meth:`close` |
| 627 | + is called; otherwise the cleaned text is empty. |
| 628 | +
|
| 629 | + **Usage:** |
| 630 | +
|
| 631 | + >>> text = ('<html><head><title>Test</title></head>' |
| 632 | + ... '<body><h1><!-- Parse --> me!</h1></body></html>') |
| 633 | + >>> GetDataHTML()(text) |
| 634 | + 'Test me!' |
| 635 | + >>> GetDataHTML(keeptags=['title'])(text) |
| 636 | + '<title>Test</title> me!' |
| 637 | + >>> GetDataHTML(removetags=['body'])(text) |
| 638 | + 'Test' |
| 639 | +
|
| 640 | + .. caution:: Tag names must be given in lowercase. |
589 | 641 |
|
590 | 642 | .. versionchanged:: 9.2 |
591 | | - This class is no longer a context manager; |
592 | | - :pylib:`contextlib.closing()<contextlib#contextlib.closing>` |
593 | | - should be used instead. |
| 643 | + No longer a context manager |
| 644 | +
|
| 645 | + .. versionchanged:: 10.3 |
| 646 | + Public class now. Added support for removals of tag contents. |
594 | 647 |
|
595 | 648 | .. seealso:: |
| 649 | + - :func:`removeHTMLParts` |
596 | 650 | - :pylib:`html.parser` |
597 | | - - :pylib:`contextlib#contextlib.closing` |
598 | 651 |
|
599 | | - :meta public: |
| 652 | + :param keeptags: List of tag names to keep, including their content |
| 653 | + and markup. Defaults to :code:`['tt', 'nowiki', 'small', 'sup']` |
| 654 | + if None. |
| 655 | + :param removetags: List of tag names whose tags and content should |
| 656 | + be removed. The tags can be preserved if listed in *keeptags*. |
| 657 | + Defaults to :code:`['style', 'script']` if None. |
| 658 | + :param removecomments: Whether to remove HTML comments. Defaults to |
| 659 | + True. |
600 | 660 | """ |
601 | 661 |
|
602 | | - textdata = '' |
603 | | - keeptags: list[str] = [] |
| 662 | + def __init__(self, *, |
| 663 | + keeptags: list[str] | None = None, |
| 664 | + removetags: list[str] | None = None) -> None: |
| 665 | + """Initialize default tags and internal state.""" |
| 666 | + super().__init__() |
| 667 | + self.keeptags: list[str] = (keeptags if keeptags is not None |
| 668 | + else ['tt', 'nowiki', 'small', 'sup']) |
| 669 | + self.removetags: list[str] = (removetags if removetags is not None |
| 670 | + else ['style', 'script']) |
| 671 | + |
| 672 | + #: The cleaned output text collected during parsing. |
| 673 | + self.textdata = '' |
| 674 | + |
| 675 | + self._skiptag: str | None = None |
604 | 676 |
|
605 | | - def handle_data(self, data) -> None: |
606 | | - """Add data to text.""" |
607 | | - self.textdata += data |
| 677 | + def __call__(self, text: str) -> str: |
| 678 | + """Feed the parser with *text* and return cleaned :attr:`textdata`. |
608 | 679 |
|
609 | | - def handle_starttag(self, tag, attrs) -> None: |
610 | | - """Add start tag to text if tag should be kept.""" |
| 680 | + :param text: The HTML text to parse and clean. |
| 681 | + :return: The cleaned text with unwanted tags/content removed. |
| 682 | + """ |
| 683 | + with closing(self): |
| 684 | + self.feed(text) |
| 685 | + return self.textdata |
| 686 | + |
| 687 | + def close(self) -> None: |
| 688 | + """Clean current processing and clear :attr:`textdata`.""" |
| 689 | + self.textdata = '' |
| 690 | + self._skiptag = None |
| 691 | + super().close() |
| 692 | + |
| 693 | + def handle_data(self, data: str) -> None: |
| 694 | + """Handle plain text content found between tags. |
| 695 | +
|
| 696 | + Text is added to the output unless it is located inside a tag |
| 697 | + marked for removal. |
| 698 | +
|
| 699 | + :param data: The text data between HTML tags. |
| 700 | + """ |
| 701 | + if not self._skiptag: |
| 702 | + self.textdata += data |
| 703 | + |
| 704 | + def handle_starttag(self, |
| 705 | + tag: str, |
| 706 | + attrs: list[tuple[str, str | None]]) -> None: |
| 707 | + """Handle an opening HTML tag. |
| 708 | +
|
| 709 | + Tags listed in *keeptags* are preserved in the output. Tags |
| 710 | + listed in *removetags* begin a skip block, and their content |
| 711 | + will be excluded from the output. |
| 712 | +
|
| 713 | + .. versionchanged:: 10.3 |
| 714 | + Keep tag attributes. |
| 715 | +
|
| 716 | + :param tag: The tag name (e.g., "div", "script") converted to |
| 717 | + lowercase. |
| 718 | + :param attrs: A list of (name, value) pairs with tag attributes. |
| 719 | + """ |
611 | 720 | if tag in self.keeptags: |
612 | | - self.textdata += f'<{tag}>' |
613 | 721 |
|
614 | | - def handle_endtag(self, tag) -> None: |
615 | | - """Add end tag to text if tag should be kept.""" |
| 722 | + # Reconstruct attributes for preserved tags |
| 723 | + attr_text = ''.join( |
| 724 | + f' {name}' if value is None else f' {name}="{value}"' |
| 725 | + for name, value in attrs |
| 726 | + ) |
| 727 | + self.textdata += f'<{tag}{attr_text}>' |
| 728 | + |
| 729 | + if tag in self.removetags: |
| 730 | + self._skiptag = tag |
| 731 | + |
| 732 | + def handle_endtag(self, tag: str) -> None: |
| 733 | + """Handle a closing HTML tag. |
| 734 | +
|
| 735 | + Tags listed in *keeptags* are preserved in the output. A closing |
| 736 | + tag that matches the currently skipped tag will end the skip |
| 737 | + block. |
| 738 | +
|
| 739 | + :param tag: The name of the closing tag. |
| 740 | + """ |
616 | 741 | if tag in self.keeptags: |
617 | 742 | self.textdata += f'</{tag}>' |
| 743 | + if tag in self.removetags and tag == self._skiptag: |
| 744 | + self._skiptag = None |
618 | 745 |
|
619 | 746 |
|
620 | 747 | def isDisabled(text: str, index: int, tags=None) -> bool: |
|
0 commit comments