From 058ee4aea0c1d8359f36c200d43868cbcdc54bd0 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 16 Feb 2026 18:07:17 +0000 Subject: [PATCH 1/3] Fix memory leaks, connection overload, and Rust best practice issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Memory leak fixes: - StreamingParser: buffer, events, and complete_elements vecs never shrank after drain operations, causing unbounded growth in long-lived parsers. Added shrink_to() calls matching StreamingSaxParser behavior. - StructuralIndex: shrink_to_fit() existed but was never called after building. Initial capacity estimates over-allocate by 2-3x; now reclaimed immediately after build_children_from_parents(). - DocumentAccumulator: reduced default pre-allocation from 64KB to 4KB. The old 64KB multiplied quickly across concurrent accumulators. Connection/contention fix: - XPath cache: replaced deep CompiledExpr cloning with Arc. Every cache hit previously cloned all Vec, Strings, and Box recursively. Now it's a cheap Arc pointer bump. Correctness + memory fix: - compare_values in XPath eval: was using format!("{}", node_id) to compare nodes, which compared raw u32 IDs as strings — both wrong per XPath 1.0 spec and wasteful (O(n*m) String allocations). Now uses actual document text content via node_string_value(). - Consolidated duplicated get_node_text_content/collect_text_content from lib.rs into shared dom::node_string_value(). Safety fix: - empty_binary: replaced .unwrap() on OwnedBinary::new(0) with match to avoid potential NIF panic. Best practices: - Added #[must_use] to evaluate(), evaluate_from_node(), compile(), validate_strict(), and XPathValue type. https://claude.ai/code/session_015igpdCrNYKuoPrHWZ5RXYc --- native/rustyxml/src/dom/document.rs | 1 + native/rustyxml/src/dom/mod.rs | 38 ++++++++++++++++++ native/rustyxml/src/index/builder.rs | 5 +++ native/rustyxml/src/lib.rs | 49 ++++------------------- native/rustyxml/src/resource.rs | 5 ++- native/rustyxml/src/strategy/streaming.rs | 15 ++++++- native/rustyxml/src/xpath/compiler.rs | 24 ++++++----- native/rustyxml/src/xpath/eval.rs | 35 +++++++++++----- native/rustyxml/src/xpath/value.rs | 1 + 9 files changed, 109 insertions(+), 64 deletions(-) diff --git a/native/rustyxml/src/dom/document.rs b/native/rustyxml/src/dom/document.rs index e730b52..6514de3 100644 --- a/native/rustyxml/src/dom/document.rs +++ b/native/rustyxml/src/dom/document.rs @@ -629,6 +629,7 @@ impl<'a> DocumentAccess for XmlDocument<'a> { /// document structure, DTD) without allocating nodes, attributes, or a /// string pool. This is the memory-efficient validation path used by /// `parse_strict` before building the structural index. +#[must_use = "validation result should be checked"] pub fn validate_strict(input: &[u8]) -> Result<(), String> { let mut reader = SliceReader::new_strict(input); let mut tag_stack: Vec> = vec![]; diff --git a/native/rustyxml/src/dom/mod.rs b/native/rustyxml/src/dom/mod.rs index 7ab32fd..1f8763b 100644 --- a/native/rustyxml/src/dom/mod.rs +++ b/native/rustyxml/src/dom/mod.rs @@ -80,3 +80,41 @@ pub trait DocumentAccess { 0 } } + +/// Get the XPath string-value of a node per XPath 1.0 spec. +/// +/// - For text/CDATA nodes: returns the text content +/// - For elements: concatenation of all descendant text nodes +/// - For other node types: empty string +pub fn node_string_value(doc: &D, node_id: NodeId) -> String { + let kind = doc.node_kind_of(node_id); + + match kind { + NodeKind::Text | NodeKind::CData => doc.text_content(node_id).unwrap_or("").to_string(), + NodeKind::Element => { + let mut result = String::new(); + collect_descendant_text(doc, node_id, &mut result); + result + } + _ => String::new(), + } +} + +/// Recursively collect text content from all descendant text nodes. +fn collect_descendant_text(doc: &D, node_id: NodeId, result: &mut String) { + for child_id in doc.children_vec(node_id) { + let kind = doc.node_kind_of(child_id); + + match kind { + NodeKind::Text | NodeKind::CData => { + if let Some(text) = doc.text_content(child_id) { + result.push_str(text); + } + } + NodeKind::Element => { + collect_descendant_text(doc, child_id, result); + } + _ => {} + } + } +} diff --git a/native/rustyxml/src/index/builder.rs b/native/rustyxml/src/index/builder.rs index 1ae9674..27aaff9 100644 --- a/native/rustyxml/src/index/builder.rs +++ b/native/rustyxml/src/index/builder.rs @@ -56,6 +56,11 @@ impl<'a> IndexBuilder<'a> { // Build children from parent links self.index.build_children_from_parents(); + // Release over-allocated capacity from initial estimates. + // Estimates are based on input size heuristics and often over-allocate + // by 2-3x. This reclaims significant memory for long-lived documents. + self.index.shrink_to_fit(); + // Debug output for structural index sizing (disabled by default) // Enable by setting RUSTYXML_DEBUG_INDEX=1 environment variable #[cfg(feature = "memory_tracking")] diff --git a/native/rustyxml/src/lib.rs b/native/rustyxml/src/lib.rs index f716379..73c96f7 100644 --- a/native/rustyxml/src/lib.rs +++ b/native/rustyxml/src/lib.rs @@ -407,46 +407,10 @@ fn xpath_string_value_doc<'a>( } } -/// Helper to get text content of a node +/// Helper to get text content of a node. +/// Delegates to the shared `dom::node_string_value` implementation. fn get_node_text_content(doc: &D, node_id: dom::NodeId) -> String { - use dom::NodeKind; - - let kind = doc.node_kind_of(node_id); - - match kind { - NodeKind::Text | NodeKind::CData => doc.text_content(node_id).unwrap_or("").to_string(), - NodeKind::Element => { - let mut result = String::new(); - collect_text_content(doc, node_id, &mut result); - result - } - _ => String::new(), - } -} - -/// Recursively collect text content from descendants -fn collect_text_content( - doc: &D, - node_id: dom::NodeId, - result: &mut String, -) { - use dom::NodeKind; - - for child_id in doc.children_vec(node_id) { - let kind = doc.node_kind_of(child_id); - - match kind { - NodeKind::Text | NodeKind::CData => { - if let Some(text) = doc.text_content(child_id) { - result.push_str(text); - } - } - NodeKind::Element => { - collect_text_content(doc, child_id, result); - } - _ => {} - } - } + dom::node_string_value(doc, node_id) } // ============================================================================ @@ -1352,8 +1316,11 @@ fn encode_attrs(buf: &mut BinaryWriter, input: &[u8], span: (usize, usize)) { /// Return an empty BEAM binary. #[inline] fn empty_binary<'a>(env: Env<'a>) -> Term<'a> { - let owned = rustler::OwnedBinary::new(0).unwrap(); - owned.release(env).encode(env) + // OwnedBinary::new(0) should never fail, but avoid panicking in a NIF. + match rustler::OwnedBinary::new(0) { + Some(owned) => owned.release(env).encode(env), + None => "".encode(env), + } } // ============================================================================ diff --git a/native/rustyxml/src/resource.rs b/native/rustyxml/src/resource.rs index 13436dd..fc1eb36 100644 --- a/native/rustyxml/src/resource.rs +++ b/native/rustyxml/src/resource.rs @@ -275,8 +275,11 @@ pub struct DocumentAccumulator { impl DocumentAccumulator { pub fn new() -> Self { + // Start with 4KB — enough for small documents, grows as needed. + // Previously 64KB which wasted memory for small documents and + // multiplied quickly across concurrent accumulators. Self { - buffer: Mutex::new(Vec::with_capacity(64 * 1024)), + buffer: Mutex::new(Vec::with_capacity(4096)), } } diff --git a/native/rustyxml/src/strategy/streaming.rs b/native/rustyxml/src/strategy/streaming.rs index 663c2a6..7217e08 100644 --- a/native/rustyxml/src/strategy/streaming.rs +++ b/native/rustyxml/src/strategy/streaming.rs @@ -173,6 +173,11 @@ impl StreamingParser { // Remove processed bytes efficiently using drain (no reallocation needed, // just moves remaining bytes to front) self.buffer.drain(..boundary); + + // Shrink buffer to avoid retaining excess capacity from large chunks. + // Cap at 8KB to match initial capacity, preventing unbounded growth + // in long-lived streaming parsers. + self.buffer.shrink_to(8192); } /// Process a slice of the buffer up to the given boundary @@ -382,7 +387,10 @@ impl StreamingParser { std::mem::take(&mut self.events) } else { // Partial take - use drain - self.events.drain(..count).collect() + let taken: Vec<_> = self.events.drain(..count).collect(); + // Shrink remaining events vec to release excess capacity + self.events.shrink_to(64); + taken } } @@ -393,7 +401,10 @@ impl StreamingParser { if count == self.complete_elements.len() { std::mem::take(&mut self.complete_elements) } else { - self.complete_elements.drain(..count).collect() + let taken: Vec<_> = self.complete_elements.drain(..count).collect(); + // Shrink remaining elements vec to release excess capacity + self.complete_elements.shrink_to(16); + taken } } diff --git a/native/rustyxml/src/xpath/compiler.rs b/native/rustyxml/src/xpath/compiler.rs index 7253895..bd079a4 100644 --- a/native/rustyxml/src/xpath/compiler.rs +++ b/native/rustyxml/src/xpath/compiler.rs @@ -6,11 +6,13 @@ use super::parser::{Axis, BinaryOp, Expr, NodeTest, Step}; use lru::LruCache; use std::num::NonZeroUsize; -use std::sync::Mutex; +use std::sync::{Arc, Mutex}; -/// Global LRU cache for compiled XPath expressions -/// Using a Mutex for thread-safety across BEAM schedulers -static XPATH_CACHE: Mutex>> = Mutex::new(None); +/// Global LRU cache for compiled XPath expressions. +/// Using Arc to avoid deep cloning on cache hits — +/// each hit is now a cheap Arc pointer bump instead of cloning +/// all Vec, Strings, and Box recursively. +static XPATH_CACHE: Mutex>>> = Mutex::new(None); /// Cache capacity - tuned for typical XPath usage patterns const CACHE_CAPACITY: usize = 256; @@ -217,26 +219,30 @@ const CACHE_CAPACITY_NONZERO: NonZeroUsize = match NonZeroUsize::new(CACHE_CAPAC None => panic!("CACHE_CAPACITY must be non-zero"), }; -/// Compile an XPath expression string (with caching) -pub fn compile(xpath: &str) -> Result { +/// Compile an XPath expression string (with caching). +/// +/// Returns `Arc` — cache hits are a cheap pointer bump +/// instead of a deep clone of all operations, strings, and predicates. +#[must_use = "compiled XPath expression should be used for evaluation"] +pub fn compile(xpath: &str) -> Result, String> { // Try to get from cache first if let Ok(mut guard) = XPATH_CACHE.lock() { let cache = guard.get_or_insert_with(|| LruCache::new(CACHE_CAPACITY_NONZERO)); if let Some(compiled) = cache.get(xpath) { - return Ok(compiled.clone()); + return Ok(Arc::clone(compiled)); } } // If mutex is poisoned, just skip the cache and compile directly // Not in cache - parse and compile let expr = super::parser::parse(xpath)?; - let compiled = CompiledExpr::compile(&expr); + let compiled = Arc::new(CompiledExpr::compile(&expr)); // Store in cache (if mutex is available) if let Ok(mut guard) = XPATH_CACHE.lock() { let cache = guard.get_or_insert_with(|| LruCache::new(CACHE_CAPACITY_NONZERO)); - cache.put(xpath.to_string(), compiled.clone()); + cache.put(xpath.to_string(), Arc::clone(&compiled)); } Ok(compiled) diff --git a/native/rustyxml/src/xpath/eval.rs b/native/rustyxml/src/xpath/eval.rs index b15dfd1..1094859 100644 --- a/native/rustyxml/src/xpath/eval.rs +++ b/native/rustyxml/src/xpath/eval.rs @@ -21,6 +21,7 @@ pub struct EvalContext<'a, D: DocumentAccess> { } /// Evaluate an XPath expression against any document type +#[must_use = "XPath evaluation result should be used"] pub fn evaluate(doc: &D, xpath: &str) -> Result { let compiled = super::compiler::compile(xpath)?; let context = EvalContext { @@ -33,6 +34,7 @@ pub fn evaluate(doc: &D, xpath: &str) -> Result( doc: &D, context_node: NodeId, @@ -266,8 +268,8 @@ pub fn evaluate_compiled<'a, D: DocumentAccess>( let result = match op { BinaryOp::Or => XPathValue::Boolean(left.to_boolean() || right.to_boolean()), BinaryOp::And => XPathValue::Boolean(left.to_boolean() && right.to_boolean()), - BinaryOp::Eq => compare_values(&left, &right, |a, b| a == b), - BinaryOp::NotEq => compare_values(&left, &right, |a, b| a != b), + BinaryOp::Eq => compare_values(ctx.doc, &left, &right, |a, b| a == b), + BinaryOp::NotEq => compare_values(ctx.doc, &left, &right, |a, b| a != b), BinaryOp::Lt => compare_numbers(&left, &right, |a, b| a < b), BinaryOp::LtEq => compare_numbers(&left, &right, |a, b| a <= b), BinaryOp::Gt => compare_numbers(&left, &right, |a, b| a > b), @@ -306,18 +308,29 @@ pub fn evaluate_compiled<'a, D: DocumentAccess>( Ok(stack.pop().unwrap_or(XPathValue::empty_nodeset())) } -/// Compare two XPath values for equality -fn compare_values(left: &XPathValue, right: &XPathValue, cmp: F) -> XPathValue +/// Compare two XPath values for equality per XPath 1.0 spec. +/// +/// Requires document access to get the string-value of nodes in node-sets. +/// Previously used `format!("{}", node_id)` which compared raw node IDs +/// as numbers — both incorrect semantics and wasteful allocations. +fn compare_values( + doc: &D, + left: &XPathValue, + right: &XPathValue, + cmp: F, +) -> XPathValue where F: Fn(&str, &str) -> bool, { + use crate::dom::node_string_value; + match (left, right) { (XPathValue::NodeSet(ln), XPathValue::NodeSet(rn)) => { // Two node-sets: true if any pair of string values match - for l in ln { - for r in rn { - let ls = format!("{}", l); - let rs = format!("{}", r); + for &l in ln { + let ls = node_string_value(doc, l); + for &r in rn { + let rs = node_string_value(doc, r); if cmp(&ls, &rs) { return XPathValue::Boolean(true); } @@ -326,10 +339,10 @@ where XPathValue::Boolean(false) } (XPathValue::NodeSet(nodes), other) | (other, XPathValue::NodeSet(nodes)) => { - // Node-set vs other: convert other to appropriate type and compare + // Node-set vs other: compare each node's string-value against the other value let other_str = other.to_string_value(); - for n in nodes { - let ns = format!("{}", n); + for &n in nodes { + let ns = node_string_value(doc, n); if cmp(&ns, &other_str) { return XPathValue::Boolean(true); } diff --git a/native/rustyxml/src/xpath/value.rs b/native/rustyxml/src/xpath/value.rs index 2ff4b0c..7162850 100644 --- a/native/rustyxml/src/xpath/value.rs +++ b/native/rustyxml/src/xpath/value.rs @@ -6,6 +6,7 @@ use crate::dom::NodeId; /// XPath value types #[derive(Debug, Clone)] +#[must_use] pub enum XPathValue { /// A set of nodes (ordered, no duplicates) NodeSet(Vec), From 603f5317293f9e57081bf000de7492b7f356c43c Mon Sep 17 00:00:00 2001 From: jeffhuen <32542276+jeffhuen@users.noreply.github.com> Date: Mon, 16 Feb 2026 12:02:36 -0800 Subject: [PATCH 2/3] Fix XPath string-value semantics, optimize NodeSet comparison, resolve all clippy warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix XPathValue::to_string_value() to return empty string for NodeSets instead of misleading format!("[node:{}]", id) — callers with document access now use dom::node_string_value() per XPath 1.0 spec - Add resolve_string() helper to XPath functions for proper NodeSet-to-string conversion; pass document access to all 8 string functions - Replace duplicated get_string_value/collect_text with dom::node_string_value - Remove residual get_node_text_content wrapper from NIF layer - Optimize NodeSet-vs-NodeSet comparison from O(n*m) to O(n+m) by pre-computing right-side string values - Fix streaming parser shrink_to() reallocation churn with 4x threshold - Remove redundant #[must_use] on compile() (Result already has it) - Remove dead inherent methods on XmlDocument duplicated by DocumentAccess trait - Remove unused XmlAttribute re-export from dom/mod.rs - Fix len_zero clippy warnings in unified_scanner tests - Add #[expect(clippy::ptr_arg)] to intern_cow (needs Cow for zero-copy optimization) - Add 4 correctness tests for NodeSet equality and string function semantics Co-Authored-By: Claude Opus 4.6 --- native/rustyxml/src/core/unified_scanner.rs | 4 +- native/rustyxml/src/dom/document.rs | 59 +------- native/rustyxml/src/dom/mod.rs | 2 - native/rustyxml/src/lib.rs | 13 +- native/rustyxml/src/strategy/streaming.rs | 22 +-- native/rustyxml/src/xpath/compiler.rs | 1 - native/rustyxml/src/xpath/eval.rs | 63 ++++++++- native/rustyxml/src/xpath/functions.rs | 143 +++++++++++--------- native/rustyxml/src/xpath/value.rs | 21 +-- 9 files changed, 172 insertions(+), 156 deletions(-) diff --git a/native/rustyxml/src/core/unified_scanner.rs b/native/rustyxml/src/core/unified_scanner.rs index b4c53d5..36db9e4 100644 --- a/native/rustyxml/src/core/unified_scanner.rs +++ b/native/rustyxml/src/core/unified_scanner.rs @@ -630,7 +630,7 @@ mod tests { // The '<' should be emitted as text, then "1invalid/>" as more text assert!( - handler.texts.len() >= 1, + !handler.texts.is_empty(), "Invalid markup should produce text events" ); assert_eq!( @@ -650,7 +650,7 @@ mod tests { // Should have text for "<1bad/>" and one valid element assert!( - handler.texts.len() >= 1, + !handler.texts.is_empty(), "Invalid markup should produce text" ); assert_eq!( diff --git a/native/rustyxml/src/dom/document.rs b/native/rustyxml/src/dom/document.rs index 6514de3..6499fd0 100644 --- a/native/rustyxml/src/dom/document.rs +++ b/native/rustyxml/src/dom/document.rs @@ -52,24 +52,11 @@ impl<'a> XmlDocument<'a> { doc } - /// Parse an XML document in strict mode - pub fn parse_strict(input: &'a [u8]) -> Result { - let mut doc = XmlDocument { - input, - nodes: Vec::with_capacity(256), - attributes: Vec::with_capacity(128), - strings: StringPool::new(), - root_element: None, - }; - - doc.nodes.push(XmlNode::document()); - doc.build_from_events(true)?; - Ok(doc) - } - /// Intern a Cow<[u8]> intelligently: /// - If Borrowed (points into input): use intern_ref (zero-copy) /// - If Owned (entity-decoded): use intern (copies to pool) + // Pattern-matches Borrowed vs Owned to choose zero-copy interning (intern_ref) or copying (intern) + #[expect(clippy::ptr_arg)] #[inline] fn intern_cow(&mut self, cow: &Cow<'_, [u8]>) -> u32 { match cow { @@ -432,26 +419,6 @@ impl<'a> XmlDocument<'a> { self.strings.get_str_with_input(node.name_id, self.input) } - /// Get node local name (without prefix) - pub fn node_local_name(&self, id: NodeId) -> Option<&str> { - let name = self.node_name(id)?; - if let Some(pos) = name.find(':') { - Some(&name[pos + 1..]) - } else { - Some(name) - } - } - - /// Get text content of a text node - pub fn text_content(&self, id: NodeId) -> Option<&str> { - let node = self.get_node(id)?; - if node.is_text() || node.kind == NodeKind::CData { - self.strings.get_str_with_input(node.name_id, self.input) - } else { - None - } - } - /// Get attributes for an element pub fn attributes(&self, id: NodeId) -> &[XmlAttribute] { if let Some(node) = self.get_node(id) { @@ -463,28 +430,6 @@ impl<'a> XmlDocument<'a> { } } - /// Get attribute value by name - pub fn get_attribute(&self, node_id: NodeId, name: &str) -> Option<&str> { - for attr in self.attributes(node_id) { - if self.strings.get_str_with_input(attr.name_id, self.input) == Some(name) { - return self.strings.get_str_with_input(attr.value_id, self.input); - } - } - None - } - - /// Get all attribute names and values for a node - pub fn get_attribute_values(&self, node_id: NodeId) -> Vec<(&str, &str)> { - self.attributes(node_id) - .iter() - .filter_map(|attr| { - let name = self.strings.get_str_with_input(attr.name_id, self.input)?; - let value = self.strings.get_str_with_input(attr.value_id, self.input)?; - Some((name, value)) - }) - .collect() - } - /// Iterate over children of a node pub fn children(&self, id: NodeId) -> ChildIter<'_, 'a> { let first = self.get_node(id).and_then(|n| n.first_child); diff --git a/native/rustyxml/src/dom/mod.rs b/native/rustyxml/src/dom/mod.rs index 1f8763b..8c2114e 100644 --- a/native/rustyxml/src/dom/mod.rs +++ b/native/rustyxml/src/dom/mod.rs @@ -12,8 +12,6 @@ pub mod node; pub mod strings; pub use document::validate_strict; -#[cfg(test)] -pub use node::XmlAttribute; pub use node::{NodeId, NodeKind, XmlNode}; #[cfg(test)] diff --git a/native/rustyxml/src/lib.rs b/native/rustyxml/src/lib.rs index 73c96f7..4c25c2a 100644 --- a/native/rustyxml/src/lib.rs +++ b/native/rustyxml/src/lib.rs @@ -234,7 +234,7 @@ fn xpath_text_list<'a>( Ok(XPathValue::NodeSet(nodes)) => { let mut list = Term::list_new_empty(env); for &id in nodes.iter().rev() { - let text = get_node_text_content(&view, id); + let text = dom::node_string_value(&view, id); let binary = term::bytes_to_binary(env, text.as_bytes()); list = list.list_prepend(binary); } @@ -276,7 +276,7 @@ fn parse_and_xpath_text<'a>( Ok(XPathValue::NodeSet(nodes)) => { let mut list = Term::list_new_empty(env); for &id in nodes.iter().rev() { - let text = get_node_text_content(&view, id); + let text = dom::node_string_value(&view, id); let binary = term::bytes_to_binary(env, text.as_bytes()); list = list.list_prepend(binary); } @@ -365,7 +365,7 @@ fn xpath_string_value<'a>(env: Env<'a>, input: Binary<'a>, xpath_str: &str) -> N xpath::XPathValue::Boolean(b) => b.to_string(), xpath::XPathValue::NodeSet(nodes) => { if let Some(&node_id) = nodes.first() { - get_node_text_content(&view, node_id) + dom::node_string_value(&view, node_id) } else { String::new() } @@ -394,7 +394,7 @@ fn xpath_string_value_doc<'a>( xpath::XPathValue::Boolean(b) => b.to_string(), xpath::XPathValue::NodeSet(nodes) => { if let Some(&node_id) = nodes.first() { - get_node_text_content(&view, node_id) + dom::node_string_value(&view, node_id) } else { String::new() } @@ -407,11 +407,6 @@ fn xpath_string_value_doc<'a>( } } -/// Helper to get text content of a node. -/// Delegates to the shared `dom::node_string_value` implementation. -fn get_node_text_content(doc: &D, node_id: dom::NodeId) -> String { - dom::node_string_value(doc, node_id) -} // ============================================================================ // Streaming Parser diff --git a/native/rustyxml/src/strategy/streaming.rs b/native/rustyxml/src/strategy/streaming.rs index 7217e08..062de29 100644 --- a/native/rustyxml/src/strategy/streaming.rs +++ b/native/rustyxml/src/strategy/streaming.rs @@ -174,10 +174,12 @@ impl StreamingParser { // just moves remaining bytes to front) self.buffer.drain(..boundary); - // Shrink buffer to avoid retaining excess capacity from large chunks. - // Cap at 8KB to match initial capacity, preventing unbounded growth - // in long-lived streaming parsers. - self.buffer.shrink_to(8192); + // Only shrink if excess capacity is significant (> 4x needed). + // Avoids grow-shrink-grow churn in active streaming while still + // reclaiming memory from large spikes in long-lived parsers. + if self.buffer.capacity() > 4 * self.buffer.len().max(8192) { + self.buffer.shrink_to(8192); + } } /// Process a slice of the buffer up to the given boundary @@ -388,8 +390,10 @@ impl StreamingParser { } else { // Partial take - use drain let taken: Vec<_> = self.events.drain(..count).collect(); - // Shrink remaining events vec to release excess capacity - self.events.shrink_to(64); + // Only shrink if excess capacity is significant (> 4x needed) + if self.events.capacity() > 4 * self.events.len().max(64) { + self.events.shrink_to(64); + } taken } } @@ -402,8 +406,10 @@ impl StreamingParser { std::mem::take(&mut self.complete_elements) } else { let taken: Vec<_> = self.complete_elements.drain(..count).collect(); - // Shrink remaining elements vec to release excess capacity - self.complete_elements.shrink_to(16); + // Only shrink if excess capacity is significant (> 4x needed) + if self.complete_elements.capacity() > 4 * self.complete_elements.len().max(16) { + self.complete_elements.shrink_to(16); + } taken } } diff --git a/native/rustyxml/src/xpath/compiler.rs b/native/rustyxml/src/xpath/compiler.rs index bd079a4..89b1503 100644 --- a/native/rustyxml/src/xpath/compiler.rs +++ b/native/rustyxml/src/xpath/compiler.rs @@ -223,7 +223,6 @@ const CACHE_CAPACITY_NONZERO: NonZeroUsize = match NonZeroUsize::new(CACHE_CAPAC /// /// Returns `Arc` — cache hits are a cheap pointer bump /// instead of a deep clone of all operations, strings, and predicates. -#[must_use = "compiled XPath expression should be used for evaluation"] pub fn compile(xpath: &str) -> Result, String> { // Try to get from cache first if let Ok(mut guard) = XPATH_CACHE.lock() { diff --git a/native/rustyxml/src/xpath/eval.rs b/native/rustyxml/src/xpath/eval.rs index 1094859..f9d275d 100644 --- a/native/rustyxml/src/xpath/eval.rs +++ b/native/rustyxml/src/xpath/eval.rs @@ -310,9 +310,10 @@ pub fn evaluate_compiled<'a, D: DocumentAccess>( /// Compare two XPath values for equality per XPath 1.0 spec. /// +/// Only used for `Eq` and `NotEq` operations — relational operators +/// (`Lt`, `Gt`, etc.) use `compare_numbers` with `f64` comparisons. +/// /// Requires document access to get the string-value of nodes in node-sets. -/// Previously used `format!("{}", node_id)` which compared raw node IDs -/// as numbers — both incorrect semantics and wasteful allocations. fn compare_values( doc: &D, left: &XPathValue, @@ -326,12 +327,14 @@ where match (left, right) { (XPathValue::NodeSet(ln), XPathValue::NodeSet(rn)) => { - // Two node-sets: true if any pair of string values match + // Two node-sets: true if any pair of string values match. + // Pre-compute right-side values to avoid O(n*m) recomputation. + let right_strings: Vec = + rn.iter().map(|&r| node_string_value(doc, r)).collect(); for &l in ln { let ls = node_string_value(doc, l); - for &r in rn { - let rs = node_string_value(doc, r); - if cmp(&ls, &rs) { + for rs in &right_strings { + if cmp(&ls, rs) { return XPathValue::Boolean(true); } } @@ -413,4 +416,52 @@ mod tests { let result = evaluate(&doc, "string-length('hello')").unwrap(); assert_eq!(result.to_number(), 5.0); } + + #[test] + fn node_equality_compares_text_content_not_ids() { + let doc = XmlDocument::parse(b"hellohelloworld"); + // Nodes with same text content should be equal + let result = evaluate(&doc, "/r/a = /r/b").unwrap(); + assert!( + result.to_boolean(), + "Nodes with same text 'hello' should be equal" + ); + // Nodes with different text content should not be equal + let result = evaluate(&doc, "/r/a = /r/c").unwrap(); + assert!( + !result.to_boolean(), + "Nodes with different text 'hello' vs 'world' should not be equal" + ); + } + + #[test] + fn node_inequality_compares_text_content() { + let doc = XmlDocument::parse(b"helloworld"); + let result = evaluate(&doc, "/r/a != /r/b").unwrap(); + assert!( + result.to_boolean(), + "Nodes with different text should be not-equal" + ); + } + + #[test] + fn string_function_on_nodeset_returns_text_content() { + let doc = XmlDocument::parse(b"content"); + let result = evaluate(&doc, "string(/r/item)").unwrap(); + assert_eq!( + result.to_string_value(), + "content", + "string() on a node-set should return text content, not node ID" + ); + } + + #[test] + fn contains_function_with_nodeset_arg() { + let doc = XmlDocument::parse(b"hello world"); + let result = evaluate(&doc, "contains(/r/a, 'world')").unwrap(); + assert!( + result.to_boolean(), + "contains() should work with NodeSet first arg" + ); + } } diff --git a/native/rustyxml/src/xpath/functions.rs b/native/rustyxml/src/xpath/functions.rs index a823147..9803290 100644 --- a/native/rustyxml/src/xpath/functions.rs +++ b/native/rustyxml/src/xpath/functions.rs @@ -17,9 +17,9 @@ //! - number(), sum(), floor(), ceiling(), round() use super::value::XPathValue; +use crate::dom::{self, DocumentAccess, NodeId}; #[cfg(test)] use crate::dom::XmlDocument; -use crate::dom::{DocumentAccess, NodeId}; /// Evaluate a function call pub fn call( @@ -42,15 +42,15 @@ pub fn call( // String Functions "string" => fn_string(args, doc, context), - "concat" => fn_concat(args), - "starts-with" => fn_starts_with(args), - "contains" => fn_contains(args), - "substring" => fn_substring(args), - "substring-before" => fn_substring_before(args), - "substring-after" => fn_substring_after(args), - "string-length" => fn_string_length(args), + "concat" => fn_concat(args, doc), + "starts-with" => fn_starts_with(args, doc), + "contains" => fn_contains(args, doc), + "substring" => fn_substring(args, doc), + "substring-before" => fn_substring_before(args, doc), + "substring-after" => fn_substring_after(args, doc), + "string-length" => fn_string_length(args, doc, context), "normalize-space" => fn_normalize_space(args, doc, context), - "translate" => fn_translate(args), + "translate" => fn_translate(args, doc), // Boolean Functions "boolean" => fn_boolean(args), @@ -142,46 +142,54 @@ fn fn_string( context: NodeId, ) -> Result { let value = if args.is_empty() { - // String value of context node - get_string_value(doc, context) + dom::node_string_value(doc, context) } else { - args[0].to_string_value() + resolve_string(&args[0], doc) }; Ok(XPathValue::String(value)) } -fn fn_concat(args: Vec) -> Result { +fn fn_concat(args: Vec, doc: &D) -> Result { if args.len() < 2 { return Err("concat() requires at least 2 arguments".to_string()); } - let result: String = args.iter().map(|a| a.to_string_value()).collect(); + let result: String = args.iter().map(|a| resolve_string(a, doc)).collect(); Ok(XPathValue::String(result)) } -fn fn_starts_with(args: Vec) -> Result { +fn fn_starts_with( + args: Vec, + doc: &D, +) -> Result { if args.len() != 2 { return Err("starts-with() requires exactly 2 arguments".to_string()); } - let s = args[0].to_string_value(); - let prefix = args[1].to_string_value(); + let s = resolve_string(&args[0], doc); + let prefix = resolve_string(&args[1], doc); Ok(XPathValue::Boolean(s.starts_with(&prefix))) } -fn fn_contains(args: Vec) -> Result { +fn fn_contains( + args: Vec, + doc: &D, +) -> Result { if args.len() != 2 { return Err("contains() requires exactly 2 arguments".to_string()); } - let s = args[0].to_string_value(); - let pattern = args[1].to_string_value(); + let s = resolve_string(&args[0], doc); + let pattern = resolve_string(&args[1], doc); Ok(XPathValue::Boolean(s.contains(&pattern))) } -fn fn_substring(args: Vec) -> Result { +fn fn_substring( + args: Vec, + doc: &D, +) -> Result { if args.len() < 2 || args.len() > 3 { return Err("substring() requires 2 or 3 arguments".to_string()); } - let s = args[0].to_string_value(); + let s = resolve_string(&args[0], doc); let start = args[1].to_number().round() as i64 - 1; // XPath is 1-indexed let start = start.max(0) as usize; @@ -198,12 +206,15 @@ fn fn_substring(args: Vec) -> Result { Ok(XPathValue::String(result)) } -fn fn_substring_before(args: Vec) -> Result { +fn fn_substring_before( + args: Vec, + doc: &D, +) -> Result { if args.len() != 2 { return Err("substring-before() requires exactly 2 arguments".to_string()); } - let s = args[0].to_string_value(); - let pattern = args[1].to_string_value(); + let s = resolve_string(&args[0], doc); + let pattern = resolve_string(&args[1], doc); let result = if let Some(pos) = s.find(&pattern) { s[..pos].to_string() @@ -214,12 +225,15 @@ fn fn_substring_before(args: Vec) -> Result { Ok(XPathValue::String(result)) } -fn fn_substring_after(args: Vec) -> Result { +fn fn_substring_after( + args: Vec, + doc: &D, +) -> Result { if args.len() != 2 { return Err("substring-after() requires exactly 2 arguments".to_string()); } - let s = args[0].to_string_value(); - let pattern = args[1].to_string_value(); + let s = resolve_string(&args[0], doc); + let pattern = resolve_string(&args[1], doc); let result = if let Some(pos) = s.find(&pattern) { s[pos + pattern.len()..].to_string() @@ -230,14 +244,18 @@ fn fn_substring_after(args: Vec) -> Result { Ok(XPathValue::String(result)) } -fn fn_string_length(args: Vec) -> Result { +fn fn_string_length( + args: Vec, + doc: &D, + context: NodeId, +) -> Result { if args.len() > 1 { return Err("string-length() requires 0 or 1 arguments".to_string()); } let s = if args.is_empty() { - String::new() // TODO: should be string value of context + dom::node_string_value(doc, context) } else { - args[0].to_string_value() + resolve_string(&args[0], doc) }; Ok(XPathValue::Number(s.chars().count() as f64)) } @@ -248,9 +266,9 @@ fn fn_normalize_space( context: NodeId, ) -> Result { let s = if args.is_empty() { - get_string_value(doc, context) + dom::node_string_value(doc, context) } else if args.len() == 1 { - args[0].to_string_value() + resolve_string(&args[0], doc) } else { return Err("normalize-space() requires 0 or 1 arguments".to_string()); }; @@ -260,14 +278,17 @@ fn fn_normalize_space( Ok(XPathValue::String(normalized)) } -fn fn_translate(args: Vec) -> Result { +fn fn_translate( + args: Vec, + doc: &D, +) -> Result { if args.len() != 3 { return Err("translate() requires exactly 3 arguments".to_string()); } - let s = args[0].to_string_value(); - let from: Vec = args[1].to_string_value().chars().collect(); - let to: Vec = args[2].to_string_value().chars().collect(); + let s = resolve_string(&args[0], doc); + let from: Vec = resolve_string(&args[1], doc).chars().collect(); + let to: Vec = resolve_string(&args[2], doc).chars().collect(); let result: String = s .chars() @@ -312,7 +333,7 @@ fn fn_number( context: NodeId, ) -> Result { let value = if args.is_empty() { - let s = get_string_value(doc, context); + let s = dom::node_string_value(doc, context); s.trim().parse().unwrap_or(f64::NAN) } else if args.len() == 1 { args[0].to_number() @@ -331,7 +352,7 @@ fn fn_sum(args: Vec, doc: &D) -> Result { let mut total = 0.0; for &node in nodes { - let s = get_string_value(doc, node); + let s = dom::node_string_value(doc, node); if let Ok(n) = s.trim().parse::() { total += n; } else { @@ -372,26 +393,21 @@ fn fn_round(args: Vec) -> Result { Ok(XPathValue::Number(rounded)) } -/// Get the string value of a node -fn get_string_value(doc: &D, node_id: NodeId) -> String { - // For text nodes, return the text content - if let Some(text) = doc.text_content(node_id) { - return text.to_string(); - } - - // For element nodes, concatenate all descendant text - let mut result = String::new(); - collect_text(doc, node_id, &mut result); - result -} - -fn collect_text(doc: &D, node_id: NodeId, result: &mut String) { - if let Some(text) = doc.text_content(node_id) { - result.push_str(text); - } - - for child in doc.children_vec(node_id) { - collect_text(doc, child, result); +/// Convert an XPath value to a string, using document access for NodeSets. +/// +/// Per XPath 1.0 spec, the string-value of a node-set is the string-value +/// of the first node in document order. This requires document access to +/// extract actual text content (unlike `XPathValue::to_string_value()`). +fn resolve_string(val: &XPathValue, doc: &D) -> String { + match val { + XPathValue::NodeSet(nodes) => { + if let Some(&first) = nodes.first() { + dom::node_string_value(doc, first) + } else { + String::new() + } + } + _ => val.to_string_value(), } } @@ -401,32 +417,35 @@ mod tests { #[test] fn test_concat() { + let doc = XmlDocument::parse(b""); let args = vec![ XPathValue::String("hello".to_string()), XPathValue::String(" ".to_string()), XPathValue::String("world".to_string()), ]; - let result = fn_concat(args).unwrap(); + let result = fn_concat(args, &doc).unwrap(); assert_eq!(result.to_string_value(), "hello world"); } #[test] fn test_contains() { + let doc = XmlDocument::parse(b""); let args = vec![ XPathValue::String("hello world".to_string()), XPathValue::String("world".to_string()), ]; - assert!(fn_contains(args).unwrap().to_boolean()); + assert!(fn_contains(args, &doc).unwrap().to_boolean()); } #[test] fn test_substring() { + let doc = XmlDocument::parse(b""); let args = vec![ XPathValue::String("hello".to_string()), XPathValue::Number(2.0), XPathValue::Number(3.0), ]; - let result = fn_substring(args).unwrap(); + let result = fn_substring(args, &doc).unwrap(); assert_eq!(result.to_string_value(), "ell"); } diff --git a/native/rustyxml/src/xpath/value.rs b/native/rustyxml/src/xpath/value.rs index 7162850..d35a55f 100644 --- a/native/rustyxml/src/xpath/value.rs +++ b/native/rustyxml/src/xpath/value.rs @@ -68,17 +68,20 @@ impl XPathValue { } } - /// Convert to string (XPath string() function semantics) + /// Convert to string (XPath string() function semantics). + /// + /// **Warning:** For `NodeSet` values, this returns an empty string because + /// proper XPath 1.0 string conversion requires document access to extract + /// text content. Use `dom::node_string_value()` directly when you have a + /// document reference and need the string-value of a node. pub fn to_string_value(&self) -> String { match self { - XPathValue::NodeSet(nodes) => { - // String value of first node in document order - // TODO: get actual text content from document - if nodes.is_empty() { - String::new() - } else { - format!("[node:{}]", nodes[0]) - } + XPathValue::NodeSet(_) => { + // XPath 1.0 spec: string-value of a node-set is the string-value + // of the first node in document order. We cannot resolve this without + // document access, so return empty string. Callers with doc access + // should use dom::node_string_value() instead. + String::new() } XPathValue::Boolean(b) => if *b { "true" } else { "false" }.to_string(), XPathValue::Number(n) => { From 45c511004029cb7e008034045efa7d862fbf00bb Mon Sep 17 00:00:00 2001 From: jeffhuen <32542276+jeffhuen@users.noreply.github.com> Date: Mon, 16 Feb 2026 12:37:49 -0800 Subject: [PATCH 3/3] Eliminate panic risk in NIF paths, fix NodeSet relational comparisons MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace expect() in XPath parser peek() with match + defensive Eof fallback — eliminates BEAM VM crash risk if invariant is violated - Replace expect() in XPath lexer read_string() with unwrap_or defensive fallback — same rationale - Thread document access into compare_numbers() so relational operators (< <= > >=) properly resolve NodeSet text content before numeric conversion — fixes silent wrong results for expressions like /r/price > 10 where 42.5 - Add resolve_number() helper mirroring resolve_string() pattern - Add test for relational operators on NodeSets Co-Authored-By: Claude Opus 4.6 --- native/rustyxml/src/xpath/eval.rs | 59 +++++++++++++++++++++++++---- native/rustyxml/src/xpath/lexer.rs | 7 ++-- native/rustyxml/src/xpath/parser.rs | 10 +++-- 3 files changed, 60 insertions(+), 16 deletions(-) diff --git a/native/rustyxml/src/xpath/eval.rs b/native/rustyxml/src/xpath/eval.rs index f9d275d..5728097 100644 --- a/native/rustyxml/src/xpath/eval.rs +++ b/native/rustyxml/src/xpath/eval.rs @@ -9,7 +9,7 @@ use super::parser::BinaryOp; use super::value::XPathValue; #[cfg(test)] use crate::dom::XmlDocument; -use crate::dom::{DocumentAccess, NodeId}; +use crate::dom::{self, DocumentAccess, NodeId}; use std::collections::HashSet; /// Evaluation context - generic over document type @@ -270,10 +270,10 @@ pub fn evaluate_compiled<'a, D: DocumentAccess>( BinaryOp::And => XPathValue::Boolean(left.to_boolean() && right.to_boolean()), BinaryOp::Eq => compare_values(ctx.doc, &left, &right, |a, b| a == b), BinaryOp::NotEq => compare_values(ctx.doc, &left, &right, |a, b| a != b), - BinaryOp::Lt => compare_numbers(&left, &right, |a, b| a < b), - BinaryOp::LtEq => compare_numbers(&left, &right, |a, b| a <= b), - BinaryOp::Gt => compare_numbers(&left, &right, |a, b| a > b), - BinaryOp::GtEq => compare_numbers(&left, &right, |a, b| a >= b), + BinaryOp::Lt => compare_numbers(ctx.doc, &left, &right, |a, b| a < b), + BinaryOp::LtEq => compare_numbers(ctx.doc, &left, &right, |a, b| a <= b), + BinaryOp::Gt => compare_numbers(ctx.doc, &left, &right, |a, b| a > b), + BinaryOp::GtEq => compare_numbers(ctx.doc, &left, &right, |a, b| a >= b), BinaryOp::Add => XPathValue::Number(left.to_number() + right.to_number()), BinaryOp::Sub => XPathValue::Number(left.to_number() - right.to_number()), BinaryOp::Mul => XPathValue::Number(left.to_number() * right.to_number()), @@ -367,12 +367,40 @@ where } } -/// Compare two values as numbers -fn compare_numbers(left: &XPathValue, right: &XPathValue, cmp: F) -> XPathValue +/// Compare two values as numbers (for relational operators: <, <=, >, >=). +/// +/// Requires document access to resolve NodeSet values to their text content +/// before numeric conversion, per XPath 1.0 spec. +fn compare_numbers( + doc: &D, + left: &XPathValue, + right: &XPathValue, + cmp: F, +) -> XPathValue where F: Fn(f64, f64) -> bool, { - XPathValue::Boolean(cmp(left.to_number(), right.to_number())) + let ln = resolve_number(doc, left); + let rn = resolve_number(doc, right); + XPathValue::Boolean(cmp(ln, rn)) +} + +/// Convert an XPath value to a number, using document access for NodeSets. +/// +/// Per XPath 1.0 spec, the number-value of a node-set is the number-value +/// of the string-value of the first node in document order. +fn resolve_number(doc: &D, val: &XPathValue) -> f64 { + match val { + XPathValue::NodeSet(nodes) => { + if let Some(&first) = nodes.first() { + let s = dom::node_string_value(doc, first); + s.trim().parse().unwrap_or(f64::NAN) + } else { + f64::NAN + } + } + _ => val.to_number(), + } } #[cfg(test)] @@ -464,4 +492,19 @@ mod tests { "contains() should work with NodeSet first arg" ); } + + #[test] + fn relational_operator_on_nodeset_resolves_text_content() { + let doc = XmlDocument::parse(b"42.5"); + let result = evaluate(&doc, "/r/price > 10").unwrap(); + assert!( + result.to_boolean(), + "NodeSet with numeric text '42.5' should be > 10" + ); + let result = evaluate(&doc, "/r/price < 100").unwrap(); + assert!( + result.to_boolean(), + "NodeSet with numeric text '42.5' should be < 100" + ); + } } diff --git a/native/rustyxml/src/xpath/lexer.rs b/native/rustyxml/src/xpath/lexer.rs index 4e2d933..1078e8b 100644 --- a/native/rustyxml/src/xpath/lexer.rs +++ b/native/rustyxml/src/xpath/lexer.rs @@ -256,10 +256,9 @@ impl<'a> Lexer<'a> { /// Read a string literal fn read_string(&mut self) -> Token { - // Safe: read_string is only called when peek() matched a quote char - let quote = self - .peek() - .expect("read_string called only when peek matched a quote"); + // Caller guarantees peek() matched a quote char. + // Use unwrap_or with '"' as defensive fallback to avoid panicking in NIF paths. + let quote = self.peek().unwrap_or('"'); self.advance(1); // Skip opening quote let start = self.pos; diff --git a/native/rustyxml/src/xpath/parser.rs b/native/rustyxml/src/xpath/parser.rs index c09cfac..6f33fb1 100644 --- a/native/rustyxml/src/xpath/parser.rs +++ b/native/rustyxml/src/xpath/parser.rs @@ -159,10 +159,12 @@ impl<'a> Parser<'a> { if self.peeked.is_none() { self.peeked = Some(self.lexer.next_token()); } - // Safe: the if-branch above guarantees peeked is Some - self.peeked - .as_ref() - .expect("peeked guaranteed Some by preceding assignment") + // Structurally guaranteed Some by the if-branch above. + // Match instead of expect() to avoid panicking in NIF paths. + match self.peeked.as_ref() { + Some(token) => token, + None => &Token::Eof, + } } /// Parse expression (handles union)