Skip to content

Commit b69c95f

Browse files
authored
Merge pull request #354 from davep/copilot/smarter-first-paragraph-extraction
Smarter first paragraph extraction using the Markdown library
2 parents 4edde24 + 8a7cf35 commit b69c95f

4 files changed

Lines changed: 256 additions & 89 deletions

File tree

ChangeLog.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
# BlogMore ChangeLog
22

3+
## Unreleased
4+
5+
**Released: WiP**
6+
7+
- Replaced the regex-based `extract_first_paragraph` implementation with a
8+
Markdown-library-powered approach.
9+
([#354](https://github.com/davep/blogmore/pull/354))
10+
311
## v2.6.0
412

513
**Released: 2026-03-28**
Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
"""First-paragraph extraction from Markdown content.
2+
3+
Converts Markdown to HTML using all BlogMore extensions and then locates
4+
the first top-level paragraph that contains real text, returning it as
5+
plain text. Block-level containers (admonitions, blockquotes, tables,
6+
lists, etc.) are skipped, as are paragraphs that consist entirely of
7+
images.
8+
"""
9+
10+
import re
11+
from html.parser import HTMLParser
12+
from typing import Any
13+
14+
import markdown
15+
16+
from blogmore.markdown.admonitions import AdmonitionsExtension
17+
from blogmore.markdown.external_links import ExternalLinksExtension
18+
from blogmore.markdown.heading_anchors import HeadingAnchorsExtension
19+
from blogmore.markdown.strikethrough import StrikethroughExtension
20+
21+
22+
def create_custom_extensions(site_url: str = "") -> list[Any]:
    """Build one instance of every custom BlogMore Markdown extension.

    This factory is the shared definition of BlogMore's custom extension
    set: the full-rendering parser and the lightweight first-paragraph
    extraction instance both obtain their custom extensions from it, so an
    extension added here is picked up by both.

    Args:
        site_url: Base URL of the site. It is handed to
            :class:`~blogmore.markdown.external_links.ExternalLinksExtension`
            so that internal links can be told apart from external ones.

    Returns:
        A new list of freshly constructed extension instances.
    """
    custom_extensions: list[Any] = [
        AdmonitionsExtension(),
        ExternalLinksExtension(site_url=site_url),
        HeadingAnchorsExtension(),
        StrikethroughExtension(),
    ]
    return custom_extensions
44+
45+
46+
def _make_extraction_markdown() -> markdown.Markdown:
    """Build the Markdown converter used for first-paragraph extraction.

    The converter combines the standard extensions that influence where
    paragraphs begin and end (``fenced_code``, ``tables``, ``footnotes``)
    with every BlogMore custom extension. Presentation-only extensions such
    as ``codehilite`` and ``toc`` are deliberately left out because they are
    not needed to obtain plain text.

    Returns:
        A new, fully configured :class:`markdown.Markdown` instance.
    """
    # Standard extensions first, then the custom set, matching the order used
    # when the list was written out literally.
    standard_extensions: list[Any] = ["fenced_code", "tables", "footnotes"]
    return markdown.Markdown(
        extensions=standard_extensions + create_custom_extensions(),
    )
65+
66+
67+
class _FirstParagraphExtractor(HTMLParser):
68+
"""HTML parser that extracts plain text from the first non-image-only paragraph.
69+
70+
Only top-level ``<p>`` elements are considered; paragraphs nested inside
71+
block-level containers such as admonition ``<div>`` elements, blockquotes,
72+
or list items are skipped. A paragraph that consists entirely of images
73+
(no text data) is also skipped so that posts that open with a banner image
74+
return the following descriptive paragraph instead.
75+
"""
76+
77+
_BLOCK_TAGS: frozenset[str] = frozenset(
78+
{
79+
"div",
80+
"blockquote",
81+
"ul",
82+
"ol",
83+
"table",
84+
"thead",
85+
"tbody",
86+
"tr",
87+
"td",
88+
"th",
89+
"pre",
90+
"figure",
91+
"section",
92+
"article",
93+
"aside",
94+
"nav",
95+
"header",
96+
"footer",
97+
"main",
98+
}
99+
)
100+
101+
def __init__(self) -> None:
102+
"""Initialise the extractor.
103+
104+
Sets up all tracking state used during parsing:
105+
106+
* ``_block_depth`` — current nesting level inside block-level container
107+
elements (``<div>``, ``<blockquote>``, ``<ul>``, etc.). Any ``<p>``
108+
encountered while this is non-zero is nested and therefore skipped.
109+
* ``_in_paragraph`` — whether the parser is currently inside a
110+
candidate top-level ``<p>`` element.
111+
* ``_chunks`` — raw character-data fragments collected from the current
112+
paragraph, joined and normalised when the paragraph ends.
113+
* ``_has_text`` — set to ``True`` as soon as non-whitespace data is
114+
seen inside the current paragraph; keeps image-only paragraphs from
115+
being returned.
116+
* ``_result`` — the accepted plain-text paragraph (empty until found).
117+
* ``_done`` — short-circuit flag; once ``True`` all further events are
118+
ignored.
119+
"""
120+
super().__init__(convert_charrefs=True)
121+
self._block_depth: int = 0
122+
self._in_paragraph: bool = False
123+
self._chunks: list[str] = []
124+
self._has_text: bool = False
125+
self._result: str = ""
126+
self._done: bool = False
127+
128+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
129+
"""Process an opening HTML tag.
130+
131+
Args:
132+
tag: The lowercase tag name.
133+
attrs: List of ``(attribute-name, value)`` pairs for the tag.
134+
"""
135+
if self._done:
136+
return
137+
if tag in self._BLOCK_TAGS:
138+
self._block_depth += 1
139+
elif tag == "p" and self._block_depth == 0 and not self._in_paragraph:
140+
self._in_paragraph = True
141+
self._chunks = []
142+
self._has_text = False
143+
144+
def handle_endtag(self, tag: str) -> None:
145+
"""Process a closing HTML tag.
146+
147+
Args:
148+
tag: The lowercase tag name.
149+
"""
150+
if self._done:
151+
return
152+
if tag in self._BLOCK_TAGS:
153+
if self._block_depth > 0:
154+
self._block_depth -= 1
155+
elif tag == "p" and self._in_paragraph:
156+
self._in_paragraph = False
157+
text = re.sub(r"\s+", " ", "".join(self._chunks)).strip()
158+
if self._has_text and text:
159+
self._result = text
160+
self._done = True
161+
162+
def handle_data(self, data: str) -> None:
163+
"""Process character data between tags.
164+
165+
Args:
166+
data: The text content between tags.
167+
"""
168+
if self._done or not self._in_paragraph:
169+
return
170+
self._chunks.append(data)
171+
if data.strip():
172+
self._has_text = True
173+
174+
@property
175+
def result(self) -> str:
176+
"""Get the extracted paragraph text.
177+
178+
Returns:
179+
The extracted first paragraph text, or an empty string if none was
180+
found.
181+
"""
182+
return self._result
183+
184+
185+
def extract_first_paragraph(content: str) -> str:
    """Return the first textual paragraph of a Markdown document as plain text.

    The Markdown is rendered to HTML with the extraction converter and the
    HTML is then scanned for the first top-level ``<p>`` element that holds
    actual text; paragraphs made up only of images do not count.

    Args:
        content: The Markdown source to inspect.

    Returns:
        The plain text of the first qualifying paragraph, or an empty string
        when the content is blank or no such paragraph exists.
    """
    if content.strip():
        rendered = _make_extraction_markdown().convert(content)
        paragraph_finder = _FirstParagraphExtractor()
        paragraph_finder.feed(rendered)
        return paragraph_finder.result
    # Blank input: nothing to render.
    return ""

src/blogmore/parser.py

Lines changed: 5 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@
1212
import yaml
1313
from pygments.formatters import HtmlFormatter
1414

15-
from blogmore.markdown.admonitions import AdmonitionsExtension
16-
from blogmore.markdown.external_links import ExternalLinksExtension
17-
from blogmore.markdown.heading_anchors import HeadingAnchorsExtension
18-
from blogmore.markdown.strikethrough import StrikethroughExtension
15+
from blogmore.markdown.first_paragraph import (
16+
create_custom_extensions,
17+
extract_first_paragraph,
18+
)
1919
from blogmore.utils import calculate_reading_time
2020

2121
_DATE_FORMATS = [
@@ -120,77 +120,6 @@ def remove_date_prefix(slug: str) -> str:
120120
return re.sub(r"^\d{4}-\d{2}-\d{2}-", "", slug)
121121

122122

123-
def extract_first_paragraph(content: str) -> str:
    """Pull the first text paragraph out of Markdown using line heuristics.

    Leading blank lines, image lines and reference link definitions are
    ignored. Headings, code fences and ``---`` lines are ignored before any
    text has been collected and terminate the paragraph afterwards. The
    collected lines are joined with single spaces and common inline Markdown
    markup is stripped.

    Args:
        content: The markdown content to extract from

    Returns:
        The first paragraph as plain text, or empty string if none found
    """
    image_prefixes = ("![", "[![", "<img")
    boundary_prefixes = ("#", "```", "---")

    collected: list[str] = []
    for raw_line in content.strip().split("\n"):
        text = raw_line.strip()

        if not text:
            # A blank line ends a started paragraph; before that it is noise.
            if collected:
                break
            continue

        # Markdown images, linked images and HTML img tags never contribute.
        if text.startswith(image_prefixes):
            continue

        # Reference link definitions: [label]: URL
        if re.match(r"^\[[^\]]+\]:\s+\S", text):
            continue

        # Headings, code fences and rules bound the paragraph.
        if text.startswith(boundary_prefixes):
            if collected:
                break
            continue

        collected.append(text)

    # Order matters: more specific patterns are applied first. Every pattern
    # keeps only its first capture group.
    unwrap_patterns = (
        r"\[([^\]]+)\]\[[^\]]*\]",  # reference links: [text][ref] or [text][]
        r"\[([^\]]+)\]\([^\)]+\)",  # inline links: [text](url)
        r"\[([^\]]+)\](?!\[|\()",  # remaining shorthand references: [text]
        r"~~(.+?)~~",  # strikethrough
        r"\*\*(.+?)\*\*",  # bold (asterisks)
        r"\*(.+?)\*",  # italic (asterisks)
        r"__(.+?)__",  # bold (underscores)
        r"_(.+?)_",  # italic (underscores)
        r"`([^`]+)`",  # inline code
    )
    cleaned = " ".join(collected)
    for pattern in unwrap_patterns:
        cleaned = re.sub(pattern, r"\1", cleaned)

    return cleaned.strip()
192-
193-
194123
@dataclass
195124
class Post:
196125
"""Represents a blog post with metadata and content."""
@@ -389,12 +318,6 @@ def __init__(self, site_url: str | None = None) -> None:
389318
Args:
390319
site_url: Optional base URL of the site for determining internal vs external links
391320
"""
392-
# Create custom extension instances
393-
external_links_ext = ExternalLinksExtension(site_url=site_url or "")
394-
admonitions_ext = AdmonitionsExtension()
395-
heading_anchors_ext = HeadingAnchorsExtension()
396-
strikethrough_ext = StrikethroughExtension()
397-
398321
self.markdown = markdown.Markdown(
399322
extensions=[
400323
"meta",
@@ -404,10 +327,7 @@ def __init__(self, site_url: str | None = None) -> None:
404327
"tables",
405328
"toc",
406329
"footnotes",
407-
admonitions_ext,
408-
external_links_ext,
409-
heading_anchors_ext,
410-
strikethrough_ext,
330+
*create_custom_extensions(site_url=site_url or ""),
411331
],
412332
extension_configs={
413333
"codehilite": {

0 commit comments

Comments
 (0)