Skip to content

Commit d46aeee

Browse files
skeeto and claude committed
Respect Atom title text types
Atom title, subtitle, rights, and summary-style constructs default to plain text when their type attribute is absent. The previous title fix decoded HTML entities by running every Atom title through html_strip, but that also treated literal angle-bracket text as markup and dropped content such as <% ... %>.

Parse Atom titles according to their declared type, and limit the compatibility path for old DB rows to entity decoding only. Stored titles are now display strings, so the entry list and detail pane no longer strip them again while rendering or sorting.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 6117fcb commit d46aeee

4 files changed

Lines changed: 31 additions & 18 deletions

File tree

src/db.cpp

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -286,12 +286,7 @@ void db_load_feed_titles(Elfeed *app)
286286
const char *url = (const char *)sqlite3_column_text(stmt, 0);
287287
const char *title = (const char *)sqlite3_column_text(stmt, 1);
288288
if (url && title && *title)
289-
// html_strip here is a one-shot cleanup for DBs that
290-
// pre-date feed.cpp's parse-time decoding — existing
291-
// rows may still contain raw `&#8211;`-style entities
292-
// until their feed is next fetched. Idempotent for
293-
// already-clean titles.
294-
app->feed_titles.emplace(url, html_strip(title));
289+
app->feed_titles.emplace(url, title);
295290
}
296291
sqlite3_finalize(stmt);
297292
}

src/entry_detail.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -162,7 +162,7 @@ void EntryDetail::show_entry(Entry *e)
162162
// read) so the subtitle and enclosure strip render properly.
163163
db_entry_load_details(app_, *e);
164164

165-
title_->SetLabel(wxString::FromUTF8(html_strip(e->title)));
165+
title_->SetLabel(wxString::FromUTF8(e->title));
166166

167167
std::string feed_title;
168168
auto fit = app_->feed_titles.find(e->feed_url);

src/entry_list.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -162,7 +162,7 @@ class EntryListModel : public wxDataViewVirtualListModel {
162162
value = wxString::FromUTF8(format_date(e.date));
163163
return;
164164
case 1:
165-
value = wxString::FromUTF8(html_strip(e.title));
165+
value = wxString::FromUTF8(e.title);
166166
return;
167167
case 2: {
168168
auto it = app_->feed_titles.find(e.feed_url);
@@ -296,7 +296,7 @@ void EntryList::apply_sort()
296296
if (a.date != b.date) c = a.date < b.date ? -1 : 1;
297297
break;
298298
case 1:
299-
c = ci_compare(html_strip(a.title), html_strip(b.title));
299+
c = ci_compare(a.title, b.title);
300300
break;
301301
case 2: {
302302
auto at = app->feed_titles.find(a.feed_url);

src/feed.cpp

Lines changed: 27 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -443,6 +443,31 @@ static std::string atom_content(pugi::xml_node entry)
443443
return result;
444444
}
445445

446+
static std::string atom_text_construct(pugi::xml_node node,
447+
const char *tag)
448+
{
449+
auto text_node = find_child(node, tag);
450+
if (!text_node) return {};
451+
452+
std::string type = attr_val(text_node, "type");
453+
if (type.empty() || type == "text")
454+
return text_node.child_value();
455+
456+
if (type == "html")
457+
return html_strip(text_node.child_value());
458+
459+
if (type == "xhtml") {
460+
std::string result;
461+
for (auto child : text_node.children())
462+
xml_unparse(child, result);
463+
return html_strip(result);
464+
}
465+
466+
// Unknown Atom text construct types are non-conforming; defaulting
467+
// to text preserves data instead of guessing it is markup.
468+
return text_node.child_value();
469+
}
470+
446471
static std::vector<Author> atom_authors(pugi::xml_node node)
447472
{
448473
std::vector<Author> result;
@@ -474,14 +499,7 @@ static void parse_atom(const std::string &url, pugi::xml_node root,
474499
if (!feed_node) feed_node = root;
475500
}
476501

477-
// Titles arrive as `type="html"` in a lot of real-world Atom
478-
// feeds even when the text looks like plain prose, so chars
479-
// end up double-encoded in the XML (`&amp;#8211;` → XML
480-
// decodes to `&#8211;` → html_strip decodes to en-dash).
481-
// Running html_strip here decodes the HTML entities *once*,
482-
// producing a clean display string. Plain-text titles with
483-
// no HTML pass through unchanged.
484-
result.feed_title = elfeed_cleanup(html_strip(child_text(feed_node, "title")));
502+
result.feed_title = elfeed_cleanup(atom_text_construct(feed_node, "title"));
485503

486504
auto feed_authors = atom_authors(feed_node);
487505
if (!feed_authors.empty())
@@ -497,7 +515,7 @@ static void parse_atom(const std::string &url, pugi::xml_node root,
497515
e.feed_url = url;
498516
e.namespace_ = ns;
499517

500-
e.title = elfeed_cleanup(html_strip(child_text(entry_node, "title")));
518+
e.title = elfeed_cleanup(atom_text_construct(entry_node, "title"));
501519

502520
// xml:base for this entry
503521
std::string entry_base = attr_val(entry_node, "base");

0 commit comments

Comments
 (0)