Respect Atom title text types

skeeto · claude · skeeto · commit d46aeee5e3b0 · 2026-04-29T16:15:36.000-04:00
Atom title, subtitle, rights, and summary-style constructs default to
plain text when their type attribute is absent. The previous title fix
decoded HTML entities by running every Atom title through html_strip,
but that also treated literal angle-bracket text as markup and dropped
content such as <% ... %>.

Parse Atom titles according to their declared type, and keep the
compatibility path for old DB rows to entity decoding only. Stored
titles are now display strings, so the entry list and detail pane no
longer strip them again while rendering or sorting.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/src/db.cpp b/src/db.cpp
@@ -286,12 +286,7 @@ void db_load_feed_titles(Elfeed *app)
         const char *url = (const char *)sqlite3_column_text(stmt, 0);
         const char *title = (const char *)sqlite3_column_text(stmt, 1);
         if (url && title && *title)
-            // html_strip here is a one-shot cleanup for DBs that
-            // pre-date feed.cpp's parse-time decoding — existing
-            // rows may still contain raw `&#8211;`-style entities
-            // until their feed is next fetched. Idempotent for
-            // already-clean titles.
-            app->feed_titles.emplace(url, html_strip(title));
+            app->feed_titles.emplace(url, title);
     }
     sqlite3_finalize(stmt);
 }
diff --git a/src/entry_detail.cpp b/src/entry_detail.cpp
@@ -162,7 +162,7 @@ void EntryDetail::show_entry(Entry *e)
     // read) so the subtitle and enclosure strip render properly.
     db_entry_load_details(app_, *e);
 
-    title_->SetLabel(wxString::FromUTF8(html_strip(e->title)));
+    title_->SetLabel(wxString::FromUTF8(e->title));
 
     std::string feed_title;
     auto fit = app_->feed_titles.find(e->feed_url);
diff --git a/src/entry_list.cpp b/src/entry_list.cpp
@@ -162,7 +162,7 @@ class EntryListModel : public wxDataViewVirtualListModel {
             value = wxString::FromUTF8(format_date(e.date));
             return;
         case 1:
-            value = wxString::FromUTF8(html_strip(e.title));
+            value = wxString::FromUTF8(e.title);
             return;
         case 2: {
             auto it = app_->feed_titles.find(e.feed_url);
@@ -296,7 +296,7 @@ void EntryList::apply_sort()
                 if (a.date != b.date) c = a.date < b.date ? -1 : 1;
                 break;
             case 1:
-                c = ci_compare(html_strip(a.title), html_strip(b.title));
+                c = ci_compare(a.title, b.title);
                 break;
             case 2: {
                 auto at = app->feed_titles.find(a.feed_url);
diff --git a/src/feed.cpp b/src/feed.cpp
@@ -443,6 +443,31 @@ static std::string atom_content(pugi::xml_node entry)
     return result;
 }
 
+static std::string atom_text_construct(pugi::xml_node node,
+                                       const char *tag)
+{
+    auto text_node = find_child(node, tag);
+    if (!text_node) return {};
+
+    std::string type = attr_val(text_node, "type");
+    if (type.empty() || type == "text")
+        return text_node.child_value();
+
+    if (type == "html")
+        return html_strip(text_node.child_value());
+
+    if (type == "xhtml") {
+        std::string result;
+        for (auto child : text_node.children())
+            xml_unparse(child, result);
+        return html_strip(result);
+    }
+
+    // Unknown Atom text construct types are non-conforming; defaulting
+    // to text preserves data instead of guessing it is markup.
+    return text_node.child_value();
+}
+
 static std::vector<Author> atom_authors(pugi::xml_node node)
 {
     std::vector<Author> result;
@@ -474,14 +499,7 @@ static void parse_atom(const std::string &url, pugi::xml_node root,
         if (!feed_node) feed_node = root;
     }
 
-    // Titles arrive as `type="html"` in a lot of real-world Atom
-    // feeds even when the text looks like plain prose, so chars
-    // end up double-encoded in the XML (`&amp;#8211;` → XML
-    // decodes to `&#8211;` → html_strip decodes to en-dash).
-    // Running html_strip here decodes the HTML entities *once*,
-    // producing a clean display string. Plain-text titles with
-    // no HTML pass through unchanged.
-    result.feed_title = elfeed_cleanup(html_strip(child_text(feed_node, "title")));
+    result.feed_title = elfeed_cleanup(atom_text_construct(feed_node, "title"));
 
     auto feed_authors = atom_authors(feed_node);
     if (!feed_authors.empty())
@@ -497,7 +515,7 @@ static void parse_atom(const std::string &url, pugi::xml_node root,
         e.feed_url = url;
         e.namespace_ = ns;
 
-        e.title = elfeed_cleanup(html_strip(child_text(entry_node, "title")));
+        e.title = elfeed_cleanup(atom_text_construct(entry_node, "title"));
 
         // xml:base for this entry
         std::string entry_base = attr_val(entry_node, "base");

Original file line number	Diff line number	Diff line change
`@@ -286,12 +286,7 @@ void db_load_feed_titles(Elfeed *app)`
`286`	`286`	`const char *url = (const char *)sqlite3_column_text(stmt, 0);`
`287`	`287`	`const char *title = (const char *)sqlite3_column_text(stmt, 1);`
`288`	`288`	`if (url && title && *title)`
`289`		`- // html_strip here is a one-shot cleanup for DBs that`
`290`		`- // pre-date feed.cpp's parse-time decoding — existing`
`291`		``- // rows may still contain raw `&#8211;`-style entities``
`292`		`- // until their feed is next fetched. Idempotent for`
`293`		`- // already-clean titles.`
`294`		`- app->feed_titles.emplace(url, html_strip(title));`
	`289`	`+ app->feed_titles.emplace(url, title);`
`295`	`290`	`}`
`296`	`291`	`sqlite3_finalize(stmt);`
`297`	`292`	`}`