From 0ef9017c6bfab3cf21f5d5139a10b396a3ae4438 Mon Sep 17 00:00:00 2001 From: noahskelton Date: Wed, 24 Jun 2026 22:08:09 +0200 Subject: [PATCH 1/3] fix: avoid stack overflows on deep html --- .../src/converter/block/table/scanner.rs | 27 ++- .../src/converter/dom_context.rs | 22 ++- crates/html-to-markdown/src/converter/main.rs | 10 +- .../src/converter/main_helpers.rs | 51 ++++-- .../src/converter/plain_text.rs | 26 ++- .../src/converter/utility/caching.rs | 35 ++-- .../src/options/conversion.rs | 11 +- .../src/types/structure_builder.rs | 169 ++++++++++-------- .../tests/deep_nesting_overflow.rs | 147 +++++++++++++++ .../html-to-markdown/tests/test_max_depth.rs | 9 +- 10 files changed, 373 insertions(+), 134 deletions(-) create mode 100644 crates/html-to-markdown/tests/deep_nesting_overflow.rs diff --git a/crates/html-to-markdown/src/converter/block/table/scanner.rs b/crates/html-to-markdown/src/converter/block/table/scanner.rs index f45b686abf..c201de0ecd 100644 --- a/crates/html-to-markdown/src/converter/block/table/scanner.rs +++ b/crates/html-to-markdown/src/converter/block/table/scanner.rs @@ -53,9 +53,9 @@ pub fn scan_table( scan } -/// Recursively scan table structure. +/// Scan table structure. /// -/// Internal recursive function that walks the table tree and collects metadata. +/// Internal function that walks the table tree and collects metadata. /// /// # Arguments /// * `node_handle` - Current node to scan @@ -71,7 +71,16 @@ fn scan_table_node( is_root: bool, scan: &mut TableScan, ) { - if let Some(node) = node_handle.get(parser) { + // Explicit work stack instead of native recursion: a table that wraps + // deeply nested content (thousands of levels) would otherwise overflow the + // native stack and abort. Visitation order does not affect the scan — every + // field is an order-independent accumulator (`row_counts` is later read only + // via `len()` and a distinct-value check). + let mut work = vec![(*node_handle, is_root)]; + while let Some((node_handle, is_root)) = work.pop() { + let Some(node) = node_handle.get(parser) else { + continue; + }; match node { tl::Node::Raw(bytes) if !scan.has_text => { let raw = bytes.as_utf8_str(); @@ -123,16 +132,20 @@ fn scan_table_node( } } } - scan_table_node(child, parser, dom_ctx, false, scan); } scan.row_counts.push(cell_count); - return; + let children: Vec<_> = tag.children().top().iter().copied().collect(); + for child in children.into_iter().rev() { + work.push((child, false)); + } + continue; } _ => {} } - for child in tag.children().top().iter() { - scan_table_node(child, parser, dom_ctx, false, scan); + let children: Vec<_> = tag.children().top().iter().copied().collect(); + for child in children.into_iter().rev() { + work.push((child, false)); } } _ => {} diff --git a/crates/html-to-markdown/src/converter/dom_context.rs b/crates/html-to-markdown/src/converter/dom_context.rs index 7eea24bf53..0adf02a05f 100644 --- a/crates/html-to-markdown/src/converter/dom_context.rs +++ b/crates/html-to-markdown/src/converter/dom_context.rs @@ -286,7 +286,13 @@ impl DomContext { pub(crate) fn text_content_uncached(&self, node_handle: tl::NodeHandle, parser: &tl::Parser) -> String { let mut text = String::with_capacity(64); - if let Some(node) = node_handle.get(parser) { + + let mut stack = vec![node_handle]; + while let Some(handle) = stack.pop() { + let Some(node) = handle.get(parser) else { + continue; + }; + match node { tl::Node::Raw(bytes) => { let raw = bytes.as_utf8_str(); @@ -294,9 +300,17 @@ impl DomContext { text.push_str(decoded.as_ref()); } tl::Node::Tag(tag) => { - let children = tag.children(); - for child_handle in children.top().iter() { - text.push_str(&self.text_content(*child_handle, parser)); + if let Some(children) = self.children_of(handle.get_inner()) { + for child_handle in children.iter().rev() { + stack.push(*child_handle); + } + } else { + let children = tag.children(); + let mut child_handles: Vec<_> = children.top().iter().copied().collect(); + child_handles.reverse(); + for child_handle in child_handles { + stack.push(child_handle); + } } } tl::Node::Comment(_) => {} diff --git a/crates/html-to-markdown/src/converter/main.rs b/crates/html-to-markdown/src/converter/main.rs index 2c9235d69c..cc1ff9e819 100644 --- a/crates/html-to-markdown/src/converter/main.rs +++ b/crates/html-to-markdown/src/converter/main.rs @@ -15,8 +15,8 @@ use std::collections::{BTreeMap, HashSet}; use crate::converter::dom_context::DomContext; use crate::converter::main_helpers::{ - collapse_excess_blank_lines, extract_head_metadata, format_metadata_frontmatter, has_custom_element_tags, - repair_with_html5ever, trim_line_end_whitespace, trim_trailing_whitespace, + collapse_excess_blank_lines, effective_max_depth, extract_head_metadata, format_metadata_frontmatter, + has_custom_element_tags, repair_with_html5ever, trim_line_end_whitespace, trim_trailing_whitespace, }; use crate::converter::plain_text::extract_plain_text; use crate::converter::preprocessing_helpers::{has_inline_block_misnest, should_drop_for_preprocessing}; @@ -339,10 +339,8 @@ pub fn walk_node( ) { let Some(node) = node_handle.get(parser) else { return }; - if let Some(max) = options.max_depth { - if depth >= max { - return; - } + if depth >= effective_max_depth(options) { + return; } match node { diff --git a/crates/html-to-markdown/src/converter/main_helpers.rs b/crates/html-to-markdown/src/converter/main_helpers.rs index b2f9d0702d..1fa6e0dc7d 100644 --- a/crates/html-to-markdown/src/converter/main_helpers.rs +++ b/crates/html-to-markdown/src/converter/main_helpers.rs @@ -6,6 +6,14 @@ use std::collections::BTreeMap; use crate::options::ConversionOptions; +use crate::options::conversion::NATIVE_STACK_SAFE_DEPTH; + +pub(crate) fn effective_max_depth(options: &ConversionOptions) -> usize { + options + .max_depth + .unwrap_or(NATIVE_STACK_SAFE_DEPTH) + .min(NATIVE_STACK_SAFE_DEPTH) +} /// Compare two tag names case-insensitively. pub fn tag_name_eq(a: impl AsRef, b: &str) -> bool { @@ -330,11 +338,29 @@ pub fn extract_head_metadata( parser: &tl::Parser, options: &ConversionOptions, ) -> BTreeMap { - let mut metadata = BTreeMap::new(); + // Pre-order search with an explicit work stack rather than native + // recursion: documents with thousands of unclosed elements nest into a + // linear chain that deep, and recursing it to look for `` would + // overflow the native stack and abort the process. Returns the first + // `` that yields metadata in document order, matching the previous + // depth-first, first-non-empty-wins behavior. + let mut work = vec![*node_handle]; + while let Some(handle) = work.pop() { + let Some(tl::Node::Tag(tag)) = handle.get(parser) else { + continue; + }; + + if !tag.name().as_utf8_str().eq_ignore_ascii_case("head") { + // Not a head tag: queue children so they pop in document order. + let children: Vec<_> = tag.children().top().iter().copied().collect(); + for child_handle in children.into_iter().rev() { + work.push(child_handle); + } + continue; + } - if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) { - // Check if this is a head tag - if tag.name().as_utf8_str().eq_ignore_ascii_case("head") { + let mut metadata = BTreeMap::new(); + { let children = tag.children(); for child_handle in children.top().iter() { if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) { @@ -402,20 +428,15 @@ pub fn extract_head_metadata( } } } - } else { - // If this is not a head tag, recursively search children for head tag - let children = tag.children(); - for child_handle in children.top().iter() { - let child_metadata = extract_head_metadata(child_handle, parser, options); - if !child_metadata.is_empty() { - metadata.extend(child_metadata); - break; // Only process first head tag found - } - } } + + if !metadata.is_empty() { + return metadata; + } + // Empty head carries no metadata: keep searching for a later one. } - metadata + BTreeMap::new() } /// Check if text has more than one character. diff --git a/crates/html-to-markdown/src/converter/plain_text.rs b/crates/html-to-markdown/src/converter/plain_text.rs index 2ac7813d86..99547937ff 100644 --- a/crates/html-to-markdown/src/converter/plain_text.rs +++ b/crates/html-to-markdown/src/converter/plain_text.rs @@ -7,6 +7,7 @@ use std::collections::HashSet; use std::fmt::Write; +use crate::converter::main_helpers::effective_max_depth; use crate::converter::preprocessing_helpers::should_drop_for_preprocessing; use crate::options::ConversionOptions; use crate::text; @@ -143,6 +144,10 @@ fn walk_plain( list_ctx: &mut ListContext, state: &WalkState<'_>, ) { + if state.depth >= effective_max_depth(state.options) { + return; + } + let Some(node) = node_handle.get(parser) else { return; }; @@ -388,7 +393,7 @@ fn walk_children( /// Walk a `` element, extracting cells as tab-separated, rows as newline-separated. fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, state: &WalkState<'_>) { - // Collect all node handles by recursing into the table + // Collect all node handles from the table subtree. let mut row_handles = Vec::new(); collect_descendant_handles(table_tag, parser, "tr", &mut row_handles); @@ -428,7 +433,7 @@ fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, st } } -/// Recursively collect all descendant `NodeHandle`s matching `target_tag` (by cloning handles). +/// Collect all descendant `NodeHandle`s matching `target_tag` (by cloning handles). fn collect_descendant_handles( tag: &tl::HTMLTag, parser: &tl::Parser, @@ -436,13 +441,20 @@ fn collect_descendant_handles( result: &mut Vec, ) { let children = tag.children(); - let top = children.top(); - for child in top.iter() { - if let Some(tl::Node::Tag(child_tag)) = child.get(parser) { + let mut stack: Vec<_> = children.top().iter().copied().collect(); + stack.reverse(); + + while let Some(handle) = stack.pop() { + if let Some(tl::Node::Tag(child_tag)) = handle.get(parser) { if child_tag.name().as_utf8_str().eq_ignore_ascii_case(target_tag) { - result.push(*child); + result.push(handle); } else { - collect_descendant_handles(child_tag, parser, target_tag, result); + let child_children = child_tag.children(); + let mut child_handles: Vec<_> = child_children.top().iter().copied().collect(); + child_handles.reverse(); + for child in child_handles { + stack.push(child); + } } } } diff --git a/crates/html-to-markdown/src/converter/utility/caching.rs b/crates/html-to-markdown/src/converter/utility/caching.rs index 40ac1110b4..568f16411c 100644 --- a/crates/html-to-markdown/src/converter/utility/caching.rs +++ b/crates/html-to-markdown/src/converter/utility/caching.rs @@ -47,7 +47,7 @@ pub fn text_cache_capacity_for_input(input_len: usize) -> NonZeroUsize { NonZeroUsize::new(target).unwrap_or(NonZeroUsize::MIN) } -/// Recursively record node hierarchy into DOM context. +/// Record node hierarchy into DOM context. /// /// Builds the complete parent-child relationship map for efficient tree traversal. pub fn record_node_hierarchy( @@ -56,19 +56,28 @@ pub fn record_node_hierarchy( parser: &tl::Parser, ctx: &mut DomContext, ) { - let id = node_handle.get_inner(); - ctx.ensure_capacity(id); - ctx.parent_map[id as usize] = parent; - ctx.node_map[id as usize] = Some(node_handle); + // Traverse with an explicit work stack rather than native recursion. `tl` + // does not apply HTML5 implied-end-tags, so a document with thousands of + // unclosed elements (e.g. `
` or `
`) nests into a linear chain + // thousands deep; recursing it would overflow the native stack and abort + // the process. Each node only writes its own slots, so visitation order is + // immaterial to the resulting maps. + let mut work = vec![(node_handle, parent)]; + while let Some((node_handle, parent)) = work.pop() { + let id = node_handle.get_inner(); + ctx.ensure_capacity(id); + ctx.parent_map[id as usize] = parent; + ctx.node_map[id as usize] = Some(node_handle); - if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) { - let children: Vec<_> = tag.children().top().iter().copied().collect(); - for (index, child) in children.iter().enumerate() { - let child_id = child.get_inner(); - ctx.ensure_capacity(child_id); - ctx.sibling_index_map[child_id as usize] = Some(index); - record_node_hierarchy(*child, Some(id), parser, ctx); + if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) { + let children: Vec<_> = tag.children().top().iter().copied().collect(); + for (index, child) in children.iter().enumerate() { + let child_id = child.get_inner(); + ctx.ensure_capacity(child_id); + ctx.sibling_index_map[child_id as usize] = Some(index); + work.push((*child, Some(id))); + } + ctx.children_map[id as usize] = Some(children); } - ctx.children_map[id as usize] = Some(children); } } diff --git a/crates/html-to-markdown/src/options/conversion.rs b/crates/html-to-markdown/src/options/conversion.rs index 47f820e66b..a140c6c69a 100644 --- a/crates/html-to-markdown/src/options/conversion.rs +++ b/crates/html-to-markdown/src/options/conversion.rs @@ -6,6 +6,10 @@ use crate::options::validation::{ UrlEscapeStyle, WhitespaceMode, }; +/// Native recursion guard used when DOM traversal would otherwise be unlimited +/// or set beyond the process stack's safe range. +pub(crate) const NATIVE_STACK_SAFE_DEPTH: usize = 64; + /// Controls which conversion tier is used. #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] #[cfg_attr( @@ -168,8 +172,11 @@ pub struct ConversionOptions { pub capture_svg: bool, /// Infer image dimensions from data. pub infer_dimensions: bool, - /// Maximum DOM traversal depth. `None` means unlimited. - /// When set, subtrees beyond this depth are silently truncated. + /// Maximum DOM traversal depth. + /// + /// `None` uses the library's internal native-stack safety limit. Explicit + /// values above that safety limit are clamped to prevent process-aborting + /// stack overflows on pathologically deep DOM trees. pub max_depth: Option, /// CSS selectors for elements to exclude entirely (element + all content). /// diff --git a/crates/html-to-markdown/src/types/structure_builder.rs b/crates/html-to-markdown/src/types/structure_builder.rs index 6c8587f530..299d3fc594 100644 --- a/crates/html-to-markdown/src/types/structure_builder.rs +++ b/crates/html-to-markdown/src/types/structure_builder.rs @@ -6,22 +6,26 @@ use std::collections::HashMap; +use crate::options::conversion::NATIVE_STACK_SAFE_DEPTH; + use super::document::{AnnotationKind, DocumentNode, DocumentStructure, MetadataEntry, NodeContent, TextAnnotation}; use super::tables::{GridCell, TableGrid}; +fn reversed_child_handles(tag: &tl::HTMLTag) -> Vec { + let children = tag.children(); + let mut handles: Vec<_> = children.top().iter().copied().collect(); + handles.reverse(); + handles +} + // ── Text extraction ─────────────────────────────────────────────────────────── /// Extract plain text from a tag's descendants, decoding HTML entities. fn extract_text(tag: &tl::HTMLTag, parser: &tl::Parser) -> String { let mut buf = String::new(); - collect_text_from_tag(tag, parser, &mut buf); - buf -} + let mut stack = reversed_child_handles(tag); -/// Recursively accumulate text content from a tag's children. -fn collect_text_from_tag(tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String) { - let children = tag.children(); - for handle in children.top().iter() { + while let Some(handle) = stack.pop() { let Some(node) = handle.get(parser) else { continue; }; @@ -37,11 +41,15 @@ fn collect_text_from_tag(tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut Strin if matches!(name.as_str(), "script" | "style" | "head") { continue; } - collect_text_from_tag(child_tag, parser, buf); + for child_handle in reversed_child_handles(child_tag) { + stack.push(child_handle); + } } tl::Node::Comment(_) => {} } } + + buf } // ── Inline annotation extraction ───────────────────────────────────────────── @@ -51,20 +59,32 @@ fn collect_text_from_tag(tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut Strin /// `text` is the pre-extracted full text of the enclosing block node; annotation /// byte offsets are computed relative to that string. fn collect_annotations(tag: &tl::HTMLTag, parser: &tl::Parser, text: &str, annotations: &mut Vec) { - collect_annotations_from_tag(tag, parser, text, &mut 0usize, annotations); -} + enum Frame { + Visit(tl::NodeHandle), + Finish { start: usize, kind: Option }, + } + + let mut offset = 0usize; + let mut stack: Vec<_> = reversed_child_handles(tag).into_iter().map(Frame::Visit).collect(); + + while let Some(frame) = stack.pop() { + let handle = match frame { + Frame::Visit(handle) => handle, + Frame::Finish { start, kind } => { + let end = offset; + if let Some(kind) = kind { + if start < end && end <= text.len() { + annotations.push(TextAnnotation { + start: start as u32, + end: end as u32, + kind, + }); + } + } + continue; + } + }; -/// Recursive helper. `offset` tracks how many bytes of `full_text` have been consumed -/// so far; it is mutated in place as we walk the tree. -fn collect_annotations_from_tag( - tag: &tl::HTMLTag, - parser: &tl::Parser, - full_text: &str, - offset: &mut usize, - annotations: &mut Vec, -) { - let children = tag.children(); - for handle in children.top().iter() { let Some(node) = handle.get(parser) else { continue; }; @@ -72,7 +92,7 @@ fn collect_annotations_from_tag( tl::Node::Raw(bytes) => { let raw = bytes.as_utf8_str(); let decoded = crate::text::decode_html_entities_cow(raw.as_ref()); - *offset += decoded.len(); + offset += decoded.len(); } tl::Node::Tag(child_tag) => { let name = child_tag.name().as_utf8_str().to_ascii_lowercase(); @@ -80,46 +100,35 @@ fn collect_annotations_from_tag( continue; } - let start = *offset; - // Recurse to advance offset over the child's text span. - collect_annotations_from_tag(child_tag, parser, full_text, offset, annotations); - let end = *offset; - - // Emit annotation only for non-empty spans that fit within the text. - if start < end && end <= full_text.len() { - let kind = match name.as_str() { - "strong" | "b" => Some(AnnotationKind::Bold), - "em" | "i" => Some(AnnotationKind::Italic), - "u" | "ins" => Some(AnnotationKind::Underline), - "s" | "del" | "strike" => Some(AnnotationKind::Strikethrough), - "code" | "kbd" | "samp" => Some(AnnotationKind::Code), - "sub" => Some(AnnotationKind::Subscript), - "sup" => Some(AnnotationKind::Superscript), - "mark" => Some(AnnotationKind::Highlight), - "a" => { - let url = child_tag - .attributes() - .get("href") - .flatten() - .map(|v| v.as_utf8_str().to_string()) - .unwrap_or_default(); - let title = child_tag - .attributes() - .get("title") - .flatten() - .map(|v| v.as_utf8_str().to_string()); - Some(AnnotationKind::Link { url, title }) - } - _ => None, - }; - - if let Some(kind) = kind { - annotations.push(TextAnnotation { - start: start as u32, - end: end as u32, - kind, - }); + let kind = match name.as_str() { + "strong" | "b" => Some(AnnotationKind::Bold), + "em" | "i" => Some(AnnotationKind::Italic), + "u" | "ins" => Some(AnnotationKind::Underline), + "s" | "del" | "strike" => Some(AnnotationKind::Strikethrough), + "code" | "kbd" | "samp" => Some(AnnotationKind::Code), + "sub" => Some(AnnotationKind::Subscript), + "sup" => Some(AnnotationKind::Superscript), + "mark" => Some(AnnotationKind::Highlight), + "a" => { + let url = child_tag + .attributes() + .get("href") + .flatten() + .map(|v| v.as_utf8_str().to_string()) + .unwrap_or_default(); + let title = child_tag + .attributes() + .get("title") + .flatten() + .map(|v| v.as_utf8_str().to_string()); + Some(AnnotationKind::Link { url, title }) } + _ => None, + }; + + stack.push(Frame::Finish { start: offset, kind }); + for child_handle in reversed_child_handles(child_tag) { + stack.push(Frame::Visit(child_handle)); } } tl::Node::Comment(_) => {} @@ -131,7 +140,7 @@ fn collect_annotations_from_tag( /// Build a [`TableGrid`] from a `` element. fn extract_table_grid(table_tag: &tl::HTMLTag, parser: &tl::Parser) -> TableGrid { - // Gather all handles (recursing through thead/tbody/tfoot). + // Gather all handles from the table subtree. let mut row_handles: Vec = Vec::new(); collect_tr_handles(table_tag, parser, &mut row_handles); @@ -200,16 +209,19 @@ fn extract_table_grid(table_tag: &tl::HTMLTag, parser: &tl::Parser) -> TableGrid } } -/// Recursively collect all `` `NodeHandle`s from within a table element. +/// Collect all `` `NodeHandle`s from within a table element. fn collect_tr_handles(tag: &tl::HTMLTag, parser: &tl::Parser, result: &mut Vec) { - let children = tag.children(); - for handle in children.top().iter() { + let mut stack = reversed_child_handles(tag); + + while let Some(handle) = stack.pop() { if let Some(tl::Node::Tag(child_tag)) = handle.get(parser) { let name = child_tag.name().as_utf8_str().to_ascii_lowercase(); if name == "tr" { - result.push(*handle); + result.push(handle); } else { - collect_tr_handles(child_tag, parser, result); + for child_handle in reversed_child_handles(child_tag) { + stack.push(child_handle); + } } } } @@ -401,7 +413,7 @@ pub fn build_document_structure(dom: &tl::VDom<'_>) -> DocumentStructure { let mut state = BuilderState::new(); for handle in dom.children() { - walk(&mut state, handle, parser, None); + walk(&mut state, handle, parser, None, 0); } DocumentStructure { @@ -413,7 +425,11 @@ pub fn build_document_structure(dom: &tl::VDom<'_>) -> DocumentStructure { /// Recursive DOM walker. /// /// `parent_idx` is the flat-list index of the nearest structural parent, if any. -fn walk(state: &mut BuilderState, handle: &tl::NodeHandle, parser: &tl::Parser, parent_idx: Option) { +fn walk(state: &mut BuilderState, handle: &tl::NodeHandle, parser: &tl::Parser, parent_idx: Option, depth: usize) { + if depth >= NATIVE_STACK_SAFE_DEPTH { + return; + } + let Some(node) = handle.get(parser) else { return; }; @@ -422,12 +438,12 @@ fn walk(state: &mut BuilderState, handle: &tl::NodeHandle, parser: &tl::Parser, tl::Node::Raw(_) | tl::Node::Comment(_) => {} tl::Node::Tag(tag) => { let tag_name = tag.name().as_utf8_str().to_ascii_lowercase(); - process_tag(state, tag_name.as_str(), tag, parser, parent_idx); + process_tag(state, tag_name.as_str(), tag, parser, parent_idx, depth); } } } -/// Decide how to handle a given tag, creating nodes and recursing as needed. +/// Decide how to handle a given tag, creating nodes and visiting children as needed. #[allow(clippy::too_many_lines)] fn process_tag( state: &mut BuilderState, @@ -435,6 +451,7 @@ fn process_tag( tag: &tl::HTMLTag, parser: &tl::Parser, parent_idx: Option, + depth: usize, ) { match tag_name { // ── Headings ────────────────────────────────────────────────────── @@ -528,7 +545,7 @@ fn process_tag( // Recurse with the list node as the parent so
  • s attach to it. let children = tag.children(); for child_handle in children.top().iter() { - walk(state, child_handle, parser, Some(list_idx)); + walk(state, child_handle, parser, Some(list_idx), depth + 1); } } @@ -663,7 +680,7 @@ fn process_tag( // Recurse into blockquote children under the Quote node. let children = tag.children(); for child_handle in children.top().iter() { - walk(state, child_handle, parser, Some(quote_idx)); + walk(state, child_handle, parser, Some(quote_idx), depth + 1); } } @@ -770,7 +787,7 @@ fn process_tag( } let children = tag.children(); for child_handle in children.top().iter() { - walk(state, child_handle, parser, Some(group_idx)); + walk(state, child_handle, parser, Some(group_idx), depth + 1); } } @@ -779,7 +796,7 @@ fn process_tag( | "form" | "fieldset" => { let children = tag.children(); for child_handle in children.top().iter() { - walk(state, child_handle, parser, parent_idx); + walk(state, child_handle, parser, parent_idx, depth + 1); } } @@ -787,7 +804,7 @@ fn process_tag( _ => { let children = tag.children(); for child_handle in children.top().iter() { - walk(state, child_handle, parser, parent_idx); + walk(state, child_handle, parser, parent_idx, depth + 1); } } } diff --git a/crates/html-to-markdown/tests/deep_nesting_overflow.rs b/crates/html-to-markdown/tests/deep_nesting_overflow.rs new file mode 100644 index 0000000000..636103f1c7 --- /dev/null +++ b/crates/html-to-markdown/tests/deep_nesting_overflow.rs @@ -0,0 +1,147 @@ +#![allow(missing_docs)] + +//! Regression: deeply nested or malformed markup must not overflow the native +//! stack. Real pages with tens of thousands of unclosed `
  • ` nest into a +//! multi-thousand-deep DOM (`tl` does not apply HTML5 implied-end-tags). The +//! auxiliary tree walks (`record_node_hierarchy`, `extract_head_metadata`, +//! `scan_table_node`, descendant text extraction, and plain/document-structure +//! walkers) used native recursion and aborted the process. The whole-subtree +//! helpers are now iterative, and the remaining recursive conversion walkers are +//! bounded by a native-stack safety limit. +//! +//! `` is used rather than `
    ` so the deep chain exercises the auxiliary +//! walks without also driving `walk_node` deep: `td` dispatches to a non- +//! recursing handler, and `walk_node` is independently bounded by `max_depth`. +//! That keeps the bound on native recursion tight enough to catch a regression +//! on a small stack — an unbounded walk over a 20k-deep chain overflows it, +//! while the fixed iterative walks use the heap and pass comfortably. + +use html_to_markdown_rs::convert; +use html_to_markdown_rs::options::{ConversionOptions, OutputFormat}; +use std::sync::{Mutex, MutexGuard}; +use std::thread; + +static TEST_MUTEX: Mutex<()> = Mutex::new(()); + +fn test_lock() -> MutexGuard<'static, ()> { + TEST_MUTEX.lock().expect("deep nesting test mutex poisoned") +} + +fn converts_without_overflow(html: String, options: ConversionOptions) -> bool { + converts_without_overflow_on_stack(html, options, 256 * 1024) +} + +fn converts_without_overflow_on_stack(html: String, options: ConversionOptions, stack_size: usize) -> bool { + thread::Builder::new() + .stack_size(stack_size) + .spawn(move || convert(&html, Some(options)).is_ok()) + .expect("spawn conversion thread") + .join() + .expect("conversion thread overflowed the stack") +} + +/// Exercises `record_node_hierarchy` (pre-pass) and `scan_table_node` (table +/// scan); the `` is found without descending the deep chain. +#[test] +fn deep_unclosed_table_cells_do_not_overflow_stack() { + let _guard = test_lock(); + let mut html = String::from("t"); + for _ in 0..20_000 { + html.push_str("
    x"); + } + html.push_str("
    "); + let options = ConversionOptions::builder().max_depth(Some(200)).build(); + assert!(converts_without_overflow(html, options)); +} + +/// No ``, so `extract_head_metadata` must search the entire deep chain — +/// the path that overflowed on the 2stable.com page. +#[test] +fn deep_markup_without_head_does_not_overflow_stack() { + let _guard = test_lock(); + let mut html = String::from(""); + for _ in 0..20_000 { + html.push_str("
    x"); + } + html.push_str("
    "); + let options = ConversionOptions::builder().max_depth(Some(200)).build(); + assert!(converts_without_overflow(html, options)); +} + +#[test] +fn deep_link_descendant_text_does_not_overflow_stack() { + let _guard = test_lock(); + let mut html = String::from("t"); + for _ in 0..1_000 { + html.push_str(""); + } + html.push_str("deep"); + html.push_str(""); + + let options = ConversionOptions::builder().max_depth(Some(200)).build(); + assert!(converts_without_overflow_on_stack(html, options, 8 * 1024 * 1024)); +} + +#[test] +fn default_depth_uses_stack_safe_limit() { + let _guard = test_lock(); + let mut html = String::from(""); + for _ in 0..1_000 { + html.push_str("
    "); + } + html.push_str("deep"); + for _ in 0..1_000 { + html.push_str("
    "); + } + html.push_str(""); + + assert!(converts_without_overflow(html, ConversionOptions::default())); +} + +#[test] +fn plain_text_output_does_not_overflow_stack() { + let _guard = test_lock(); + let mut html = String::from(""); + for _ in 0..1_000 { + html.push_str("
    "); + } + html.push_str("deep"); + for _ in 0..1_000 { + html.push_str("
    "); + } + html.push_str(""); + + let options = ConversionOptions { + output_format: OutputFormat::Plain, + max_depth: Some(200), + ..Default::default() + }; + assert!(converts_without_overflow(html, options)); +} + +#[test] +fn document_structure_builder_does_not_overflow_stack() { + let _guard = test_lock(); + let mut html = String::from(""); + for _ in 0..1_000 { + html.push_str("
    "); + } + html.push_str("

    deep

    "); + for _ in 0..1_000 { + html.push_str("
    "); + } + html.push_str(""); + + assert!( + thread::Builder::new() + .stack_size(8 * 1024 * 1024) + .spawn(move || { + let dom = tl::parse(&html, tl::ParserOptions::default()).expect("parse deep html"); + let document = html_to_markdown_rs::types::build_document_structure(&dom); + !document.nodes.is_empty() + }) + .expect("spawn structure thread") + .join() + .expect("structure thread overflowed the stack") + ); +} diff --git a/crates/html-to-markdown/tests/test_max_depth.rs b/crates/html-to-markdown/tests/test_max_depth.rs index 76a3b35580..03a94ca570 100644 --- a/crates/html-to-markdown/tests/test_max_depth.rs +++ b/crates/html-to-markdown/tests/test_max_depth.rs @@ -11,12 +11,13 @@ fn convert_with_options(html: &str, options: ConversionOptions) -> String { .unwrap_or_default() } -/// With the default `max_depth: None`, deeply nested content should be fully converted. +/// With the default `max_depth: None`, ordinary nesting below the native stack +/// safety limit should be fully converted. #[test] -fn test_max_depth_none_converts_deeply_nested() { - // Build 100 levels of nesting around a leaf text node. +fn test_max_depth_none_converts_reasonably_nested_content() { + // Build 32 levels of nesting around a leaf text node. let mut html = String::from("

    deep

    "); - for _ in 0..100 { + for _ in 0..32 { html = format!("
    {html}
    "); } From 345b1b60ae5ba713b98f022c8e41b8d3fa4645da Mon Sep 17 00:00:00 2001 From: noahskelton Date: Wed, 24 Jun 2026 22:15:57 +0200 Subject: [PATCH 2/3] docs: clarify stack overflow regression comments --- .../src/converter/block/table/scanner.rs | 8 +++---- .../src/converter/main_helpers.rs | 12 ++++------ .../src/converter/utility/caching.rs | 9 +++---- .../tests/deep_nesting_overflow.rs | 24 +++++++------------ 4 files changed, 19 insertions(+), 34 deletions(-) diff --git a/crates/html-to-markdown/src/converter/block/table/scanner.rs b/crates/html-to-markdown/src/converter/block/table/scanner.rs index c201de0ecd..8ffea56e12 100644 --- a/crates/html-to-markdown/src/converter/block/table/scanner.rs +++ b/crates/html-to-markdown/src/converter/block/table/scanner.rs @@ -71,11 +71,9 @@ fn scan_table_node( is_root: bool, scan: &mut TableScan, ) { - // Explicit work stack instead of native recursion: a table that wraps - // deeply nested content (thousands of levels) would otherwise overflow the - // native stack and abort. Visitation order does not affect the scan — every - // field is an order-independent accumulator (`row_counts` is later read only - // via `len()` and a distinct-value check). + // The work stack keeps table scans on the heap for deeply nested table + // content. Every scan field is an order-independent accumulator; `row_counts` + // is later read through its length and distinct value count. let mut work = vec![(*node_handle, is_root)]; while let Some((node_handle, is_root)) = work.pop() { let Some(node) = node_handle.get(parser) else { diff --git a/crates/html-to-markdown/src/converter/main_helpers.rs b/crates/html-to-markdown/src/converter/main_helpers.rs index 1fa6e0dc7d..983eb8f266 100644 --- a/crates/html-to-markdown/src/converter/main_helpers.rs +++ b/crates/html-to-markdown/src/converter/main_helpers.rs @@ -338,12 +338,10 @@ pub fn extract_head_metadata( parser: &tl::Parser, options: &ConversionOptions, ) -> BTreeMap { - // Pre-order search with an explicit work stack rather than native - // recursion: documents with thousands of unclosed elements nest into a - // linear chain that deep, and recursing it to look for `` would - // overflow the native stack and abort the process. Returns the first - // `` that yields metadata in document order, matching the previous - // depth-first, first-non-empty-wins behavior. + // The work stack keeps the `` search on the heap for malformed + // documents whose unclosed elements form thousand-level DOM chains. Children + // are pushed in reverse so matching still returns the first non-empty + // `` in document order. let mut work = vec![*node_handle]; while let Some(handle) = work.pop() { let Some(tl::Node::Tag(tag)) = handle.get(parser) else { @@ -351,7 +349,7 @@ pub fn extract_head_metadata( }; if !tag.name().as_utf8_str().eq_ignore_ascii_case("head") { - // Not a head tag: queue children so they pop in document order. + // Queue children in reverse so they pop in document order. let children: Vec<_> = tag.children().top().iter().copied().collect(); for child_handle in children.into_iter().rev() { work.push(child_handle); diff --git a/crates/html-to-markdown/src/converter/utility/caching.rs b/crates/html-to-markdown/src/converter/utility/caching.rs index 568f16411c..5b3d5bbd96 100644 --- a/crates/html-to-markdown/src/converter/utility/caching.rs +++ b/crates/html-to-markdown/src/converter/utility/caching.rs @@ -56,12 +56,9 @@ pub fn record_node_hierarchy( parser: &tl::Parser, ctx: &mut DomContext, ) { - // Traverse with an explicit work stack rather than native recursion. `tl` - // does not apply HTML5 implied-end-tags, so a document with thousands of - // unclosed elements (e.g. `
    ` or `
    `) nests into a linear chain - // thousands deep; recursing it would overflow the native stack and abort - // the process. Each node only writes its own slots, so visitation order is - // immaterial to the resulting maps. + // The work stack keeps hierarchy recording on the heap for DOM chains + // created by unclosed elements. Each node writes only its own map entries; + // the same parent/child maps result from any traversal order. let mut work = vec![(node_handle, parent)]; while let Some((node_handle, parent)) = work.pop() { let id = node_handle.get_inner(); diff --git a/crates/html-to-markdown/tests/deep_nesting_overflow.rs b/crates/html-to-markdown/tests/deep_nesting_overflow.rs index 636103f1c7..bb0137520b 100644 --- a/crates/html-to-markdown/tests/deep_nesting_overflow.rs +++ b/crates/html-to-markdown/tests/deep_nesting_overflow.rs @@ -1,20 +1,13 @@ #![allow(missing_docs)] -//! Regression: deeply nested or malformed markup must not overflow the native -//! stack. Real pages with tens of thousands of unclosed `
    ` nest into a -//! multi-thousand-deep DOM (`tl` does not apply HTML5 implied-end-tags). The -//! auxiliary tree walks (`record_node_hierarchy`, `extract_head_metadata`, -//! `scan_table_node`, descendant text extraction, and plain/document-structure -//! walkers) used native recursion and aborted the process. The whole-subtree -//! helpers are now iterative, and the remaining recursive conversion walkers are -//! bounded by a native-stack safety limit. +//! Regression coverage for deeply nested and malformed markup. `tl` preserves +//! repeated unclosed `` tags as a multi-thousand-level DOM chain, so these +//! tests run conversion on small thread stacks to catch native-stack recursion +//! in whole-subtree helpers. //! -//! `` is used rather than `
    ` so the deep chain exercises the auxiliary -//! walks without also driving `walk_node` deep: `td` dispatches to a non- -//! recursing handler, and `walk_node` is independently bounded by `max_depth`. -//! That keeps the bound on native recursion tight enough to catch a regression -//! on a small stack — an unbounded walk over a 20k-deep chain overflows it, -//! while the fixed iterative walks use the heap and pass comfortably. +//! The `
    ` shape reaches hierarchy recording, metadata extraction, and table +//! scanning while the main conversion walker remains bounded by its own depth +//! guard. That keeps failures attributable to the helper traversal under test. use html_to_markdown_rs::convert; use html_to_markdown_rs::options::{ConversionOptions, OutputFormat}; @@ -54,8 +47,7 @@ fn deep_unclosed_table_cells_do_not_overflow_stack() { assert!(converts_without_overflow(html, options)); } -/// No ``, so `extract_head_metadata` must search the entire deep chain — -/// the path that overflowed on the 2stable.com page. +/// No ``, so metadata extraction must search the entire deep chain. #[test] fn deep_markup_without_head_does_not_overflow_stack() { let _guard = test_lock(); From 2a36e84ab5611d4894683f082baf0423f30e79d7 Mon Sep 17 00:00:00 2001 From: noahskelton Date: Wed, 24 Jun 2026 22:28:31 +0200 Subject: [PATCH 3/3] fix: satisfy clippy on stack-safe traversal --- .../html-to-markdown/src/converter/block/table/scanner.rs | 8 ++++---- crates/html-to-markdown/src/converter/main_helpers.rs | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/html-to-markdown/src/converter/block/table/scanner.rs b/crates/html-to-markdown/src/converter/block/table/scanner.rs index 8ffea56e12..f41b819c9f 100644 --- a/crates/html-to-markdown/src/converter/block/table/scanner.rs +++ b/crates/html-to-markdown/src/converter/block/table/scanner.rs @@ -132,8 +132,8 @@ fn scan_table_node( } } scan.row_counts.push(cell_count); - let children: Vec<_> = tag.children().top().iter().copied().collect(); - for child in children.into_iter().rev() { + let mut children: Vec<_> = tag.children().top().iter().copied().collect(); + while let Some(child) = children.pop() { work.push((child, false)); } continue; @@ -141,8 +141,8 @@ fn scan_table_node( _ => {} } - let children: Vec<_> = tag.children().top().iter().copied().collect(); - for child in children.into_iter().rev() { + let mut children: Vec<_> = tag.children().top().iter().copied().collect(); + while let Some(child) = children.pop() { work.push((child, false)); } } diff --git a/crates/html-to-markdown/src/converter/main_helpers.rs b/crates/html-to-markdown/src/converter/main_helpers.rs index 983eb8f266..e56f031642 100644 --- a/crates/html-to-markdown/src/converter/main_helpers.rs +++ b/crates/html-to-markdown/src/converter/main_helpers.rs @@ -8,7 +8,7 @@ use std::collections::BTreeMap; use crate::options::ConversionOptions; use crate::options::conversion::NATIVE_STACK_SAFE_DEPTH; -pub(crate) fn effective_max_depth(options: &ConversionOptions) -> usize { +pub fn effective_max_depth(options: &ConversionOptions) -> usize { options .max_depth .unwrap_or(NATIVE_STACK_SAFE_DEPTH)