Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 18 additions & 7 deletions crates/html-to-markdown/src/converter/block/table/scanner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ pub fn scan_table(
scan
}

/// Recursively scan table structure.
/// Scan table structure.
///
/// Internal recursive function that walks the table tree and collects metadata.
/// Internal function that walks the table tree and collects metadata.
///
/// # Arguments
/// * `node_handle` - Current node to scan
Expand All @@ -71,7 +71,14 @@ fn scan_table_node(
is_root: bool,
scan: &mut TableScan,
) {
if let Some(node) = node_handle.get(parser) {
// The work stack keeps table scans on the heap for deeply nested table
// content. Every scan field is an order-independent accumulator; `row_counts`
// is later read through its length and distinct value count.
let mut work = vec![(*node_handle, is_root)];
while let Some((node_handle, is_root)) = work.pop() {
let Some(node) = node_handle.get(parser) else {
continue;
};
match node {
tl::Node::Raw(bytes) if !scan.has_text => {
let raw = bytes.as_utf8_str();
Expand Down Expand Up @@ -123,16 +130,20 @@ fn scan_table_node(
}
}
}
scan_table_node(child, parser, dom_ctx, false, scan);
}
scan.row_counts.push(cell_count);
return;
let mut children: Vec<_> = tag.children().top().iter().copied().collect();
while let Some(child) = children.pop() {
work.push((child, false));
}
continue;
}
_ => {}
}

for child in tag.children().top().iter() {
scan_table_node(child, parser, dom_ctx, false, scan);
let mut children: Vec<_> = tag.children().top().iter().copied().collect();
while let Some(child) = children.pop() {
work.push((child, false));
}
}
_ => {}
Expand Down
22 changes: 18 additions & 4 deletions crates/html-to-markdown/src/converter/dom_context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -286,17 +286,31 @@ impl DomContext {

pub(crate) fn text_content_uncached(&self, node_handle: tl::NodeHandle, parser: &tl::Parser) -> String {
let mut text = String::with_capacity(64);
if let Some(node) = node_handle.get(parser) {

let mut stack = vec![node_handle];
while let Some(handle) = stack.pop() {
let Some(node) = handle.get(parser) else {
continue;
};

match node {
tl::Node::Raw(bytes) => {
let raw = bytes.as_utf8_str();
let decoded = text::decode_html_entities_cow(raw.as_ref());
text.push_str(decoded.as_ref());
}
tl::Node::Tag(tag) => {
let children = tag.children();
for child_handle in children.top().iter() {
text.push_str(&self.text_content(*child_handle, parser));
if let Some(children) = self.children_of(handle.get_inner()) {
for child_handle in children.iter().rev() {
stack.push(*child_handle);
}
} else {
let children = tag.children();
let mut child_handles: Vec<_> = children.top().iter().copied().collect();
child_handles.reverse();
for child_handle in child_handles {
stack.push(child_handle);
}
}
}
tl::Node::Comment(_) => {}
Expand Down
10 changes: 4 additions & 6 deletions crates/html-to-markdown/src/converter/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ use std::collections::{BTreeMap, HashSet};

use crate::converter::dom_context::DomContext;
use crate::converter::main_helpers::{
collapse_excess_blank_lines, extract_head_metadata, format_metadata_frontmatter, has_custom_element_tags,
repair_with_html5ever, trim_line_end_whitespace, trim_trailing_whitespace,
collapse_excess_blank_lines, effective_max_depth, extract_head_metadata, format_metadata_frontmatter,
has_custom_element_tags, repair_with_html5ever, trim_line_end_whitespace, trim_trailing_whitespace,
};
use crate::converter::plain_text::extract_plain_text;
use crate::converter::preprocessing_helpers::{has_inline_block_misnest, should_drop_for_preprocessing};
Expand Down Expand Up @@ -339,10 +339,8 @@ pub fn walk_node(
) {
let Some(node) = node_handle.get(parser) else { return };

if let Some(max) = options.max_depth {
if depth >= max {
return;
}
if depth >= effective_max_depth(options) {
return;
}

match node {
Expand Down
49 changes: 34 additions & 15 deletions crates/html-to-markdown/src/converter/main_helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@
use std::collections::BTreeMap;

use crate::options::ConversionOptions;
use crate::options::conversion::NATIVE_STACK_SAFE_DEPTH;

/// Resolve the traversal depth limit that is actually enforced.
///
/// Returns `options.max_depth` when it is set and lies below
/// `NATIVE_STACK_SAFE_DEPTH`. Returns `NATIVE_STACK_SAFE_DEPTH` when the
/// option is `None` or when the requested depth reaches or exceeds that
/// ceiling, so the result never exceeds the ceiling.
pub fn effective_max_depth(options: &ConversionOptions) -> usize {
    match options.max_depth {
        // An explicit limit under the ceiling is honoured as-is.
        Some(requested) if requested < NATIVE_STACK_SAFE_DEPTH => requested,
        // Unset or too-large limits collapse to the ceiling.
        _ => NATIVE_STACK_SAFE_DEPTH,
    }
}

/// Compare two tag names case-insensitively.
pub fn tag_name_eq(a: impl AsRef<str>, b: &str) -> bool {
Expand Down Expand Up @@ -330,11 +338,27 @@ pub fn extract_head_metadata(
parser: &tl::Parser,
options: &ConversionOptions,
) -> BTreeMap<String, String> {
let mut metadata = BTreeMap::new();
// The work stack keeps the `<head>` search on the heap for malformed
// documents whose unclosed elements form thousand-level DOM chains. Children
// are pushed in reverse so matching still returns the first non-empty
// `<head>` in document order.
let mut work = vec![*node_handle];
while let Some(handle) = work.pop() {
let Some(tl::Node::Tag(tag)) = handle.get(parser) else {
continue;
};

if !tag.name().as_utf8_str().eq_ignore_ascii_case("head") {
// Queue children in reverse so they pop in document order.
let children: Vec<_> = tag.children().top().iter().copied().collect();
for child_handle in children.into_iter().rev() {
work.push(child_handle);
}
continue;
}

if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
// Check if this is a head tag
if tag.name().as_utf8_str().eq_ignore_ascii_case("head") {
let mut metadata = BTreeMap::new();
{
let children = tag.children();
for child_handle in children.top().iter() {
if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
Expand Down Expand Up @@ -402,20 +426,15 @@ pub fn extract_head_metadata(
}
}
}
} else {
// If this is not a head tag, recursively search children for head tag
let children = tag.children();
for child_handle in children.top().iter() {
let child_metadata = extract_head_metadata(child_handle, parser, options);
if !child_metadata.is_empty() {
metadata.extend(child_metadata);
break; // Only process first head tag found
}
}
}

if !metadata.is_empty() {
return metadata;
}
// Empty head carries no metadata: keep searching for a later one.
}

metadata
BTreeMap::new()
}

/// Check if text has more than one character.
Expand Down
26 changes: 19 additions & 7 deletions crates/html-to-markdown/src/converter/plain_text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
use std::collections::HashSet;
use std::fmt::Write;

use crate::converter::main_helpers::effective_max_depth;
use crate::converter::preprocessing_helpers::should_drop_for_preprocessing;
use crate::options::ConversionOptions;
use crate::text;
Expand Down Expand Up @@ -143,6 +144,10 @@ fn walk_plain(
list_ctx: &mut ListContext,
state: &WalkState<'_>,
) {
if state.depth >= effective_max_depth(state.options) {
return;
}

let Some(node) = node_handle.get(parser) else {
return;
};
Expand Down Expand Up @@ -388,7 +393,7 @@ fn walk_children(

/// Walk a `<table>` element, extracting cells as tab-separated, rows as newline-separated.
fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, state: &WalkState<'_>) {
// Collect all <tr> node handles by recursing into the table
// Collect all <tr> node handles from the table subtree.
let mut row_handles = Vec::new();
collect_descendant_handles(table_tag, parser, "tr", &mut row_handles);

Expand Down Expand Up @@ -428,21 +433,28 @@ fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, st
}
}

/// Recursively collect all descendant `NodeHandle`s matching `target_tag` (by cloning handles).
/// Collect all descendant `NodeHandle`s matching `target_tag` (by cloning handles).
fn collect_descendant_handles(
tag: &tl::HTMLTag,
parser: &tl::Parser,
target_tag: &str,
result: &mut Vec<tl::NodeHandle>,
) {
let children = tag.children();
let top = children.top();
for child in top.iter() {
if let Some(tl::Node::Tag(child_tag)) = child.get(parser) {
let mut stack: Vec<_> = children.top().iter().copied().collect();
stack.reverse();

while let Some(handle) = stack.pop() {
if let Some(tl::Node::Tag(child_tag)) = handle.get(parser) {
if child_tag.name().as_utf8_str().eq_ignore_ascii_case(target_tag) {
result.push(*child);
result.push(handle);
} else {
collect_descendant_handles(child_tag, parser, target_tag, result);
let child_children = child_tag.children();
let mut child_handles: Vec<_> = child_children.top().iter().copied().collect();
child_handles.reverse();
for child in child_handles {
stack.push(child);
}
}
}
}
Expand Down
32 changes: 19 additions & 13 deletions crates/html-to-markdown/src/converter/utility/caching.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ pub fn text_cache_capacity_for_input(input_len: usize) -> NonZeroUsize {
NonZeroUsize::new(target).unwrap_or(NonZeroUsize::MIN)
}

/// Recursively record node hierarchy into DOM context.
/// Record node hierarchy into DOM context.
///
/// Builds the complete parent-child relationship map for efficient tree traversal.
pub fn record_node_hierarchy(
Expand All @@ -56,19 +56,25 @@ pub fn record_node_hierarchy(
parser: &tl::Parser,
ctx: &mut DomContext,
) {
let id = node_handle.get_inner();
ctx.ensure_capacity(id);
ctx.parent_map[id as usize] = parent;
ctx.node_map[id as usize] = Some(node_handle);
// The work stack keeps hierarchy recording on the heap for DOM chains
// created by unclosed elements. Each node writes only its own map entries;
// the same parent/child maps result from any traversal order.
let mut work = vec![(node_handle, parent)];
while let Some((node_handle, parent)) = work.pop() {
let id = node_handle.get_inner();
ctx.ensure_capacity(id);
ctx.parent_map[id as usize] = parent;
ctx.node_map[id as usize] = Some(node_handle);

if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
let children: Vec<_> = tag.children().top().iter().copied().collect();
for (index, child) in children.iter().enumerate() {
let child_id = child.get_inner();
ctx.ensure_capacity(child_id);
ctx.sibling_index_map[child_id as usize] = Some(index);
record_node_hierarchy(*child, Some(id), parser, ctx);
if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
let children: Vec<_> = tag.children().top().iter().copied().collect();
for (index, child) in children.iter().enumerate() {
let child_id = child.get_inner();
ctx.ensure_capacity(child_id);
ctx.sibling_index_map[child_id as usize] = Some(index);
work.push((*child, Some(id)));
}
ctx.children_map[id as usize] = Some(children);
}
ctx.children_map[id as usize] = Some(children);
}
}
11 changes: 9 additions & 2 deletions crates/html-to-markdown/src/options/conversion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ use crate::options::validation::{
UrlEscapeStyle, WhitespaceMode,
};

/// Native recursion guard for DOM traversal depth.
///
/// Serves both as the default depth when `ConversionOptions::max_depth` is
/// `None` and as the ceiling that explicit `max_depth` values are clamped to,
/// so traversal is never unlimited or set beyond the process stack's safe
/// range.
pub(crate) const NATIVE_STACK_SAFE_DEPTH: usize = 64;

/// Controls which conversion tier is used.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
#[cfg_attr(
Expand Down Expand Up @@ -168,8 +172,11 @@ pub struct ConversionOptions {
pub capture_svg: bool,
/// Infer image dimensions from data.
pub infer_dimensions: bool,
/// Maximum DOM traversal depth. `None` means unlimited.
/// When set, subtrees beyond this depth are silently truncated.
/// Maximum DOM traversal depth.
///
/// `None` uses the library's internal native-stack safety limit. Explicit
/// values above that safety limit are clamped to prevent process-aborting
/// stack overflows on pathologically deep DOM trees.
pub max_depth: Option<usize>,
/// CSS selectors for elements to exclude entirely (element + all content).
///
Expand Down
Loading
Loading