Skip to content

Commit f33c37a

Browse files
jeffhuenclaude
andauthored
Refactor XPath evaluation, fix memory leaks, and resolve all clippy warnings
* Fix memory leaks, connection overload, and Rust best practice issues Memory leak fixes: - StreamingParser: buffer, events, and complete_elements vecs never shrank after drain operations, causing unbounded growth in long-lived parsers. Added shrink_to() calls matching StreamingSaxParser behavior. - StructuralIndex: shrink_to_fit() existed but was never called after building. Initial capacity estimates over-allocate by 2-3x; now reclaimed immediately after build_children_from_parents(). - DocumentAccumulator: reduced default pre-allocation from 64KB to 4KB. The old 64KB multiplied quickly across concurrent accumulators. Connection/contention fix: - XPath cache: replaced deep CompiledExpr cloning with Arc<CompiledExpr>. Every cache hit previously cloned all Vec<Op>, Strings, and Box<CompiledExpr> recursively. Now it's a cheap Arc pointer bump. Correctness + memory fix: - compare_values in XPath eval: was using format!("{}", node_id) to compare nodes, which compared raw u32 IDs as strings — both wrong per XPath 1.0 spec and wasteful (O(n*m) String allocations). Now uses actual document text content via node_string_value(). - Consolidated duplicated get_node_text_content/collect_text_content from lib.rs into shared dom::node_string_value(). Safety fix: - empty_binary: replaced .unwrap() on OwnedBinary::new(0) with match to avoid potential NIF panic. Best practices: - Added #[must_use] to evaluate(), evaluate_from_node(), compile(), validate_strict(), and XPathValue type. https://claude.ai/code/session_015igpdCrNYKuoPrHWZ5RXYc * Fix XPath string-value semantics, optimize NodeSet comparison, resolve all clippy warnings - Fix XPathValue::to_string_value() to return empty string for NodeSets instead of misleading format!("[node:{}]", id) — callers with document access now use dom::node_string_value() per XPath 1.0 spec - Add resolve_string() helper to XPath functions for proper NodeSet-to-string conversion; pass document access to all 8 string functions - Replace duplicated get_string_value/collect_text with dom::node_string_value - Remove residual get_node_text_content wrapper from NIF layer - Optimize NodeSet-vs-NodeSet comparison from O(n*m) to O(n+m) by pre-computing right-side string values - Fix streaming parser shrink_to() reallocation churn with 4x threshold - Remove redundant #[must_use] on compile() (Result already has it) - Remove dead inherent methods on XmlDocument duplicated by DocumentAccess trait - Remove unused XmlAttribute re-export from dom/mod.rs - Fix len_zero clippy warnings in unified_scanner tests - Add #[expect(clippy::ptr_arg)] to intern_cow (needs Cow for zero-copy optimization) - Add 4 correctness tests for NodeSet equality and string function semantics Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Eliminate panic risk in NIF paths, fix NodeSet relational comparisons - Replace expect() in XPath parser peek() with match + defensive Eof fallback — eliminates BEAM VM crash risk if invariant is violated - Replace expect() in XPath lexer read_string() with unwrap_or defensive fallback — same rationale - Thread document access into compare_numbers() so relational operators (< <= > >=) properly resolve NodeSet text content before numeric conversion — fixes silent wrong results for expressions like /r/price > 10 where <price>42.5</price> - Add resolve_number() helper mirroring resolve_string() pattern - Add test for relational operators on NodeSets Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude <noreply@anthropic.com>
1 parent 6eeddf3 commit f33c37a

File tree

13 files changed

+325
-220
lines changed

13 files changed

+325
-220
lines changed

native/rustyxml/src/core/unified_scanner.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -630,7 +630,7 @@ mod tests {
630630

631631
// The '<' should be emitted as text, then "1invalid/>" as more text
632632
assert!(
633-
handler.texts.len() >= 1,
633+
!handler.texts.is_empty(),
634634
"Invalid markup should produce text events"
635635
);
636636
assert_eq!(
@@ -650,7 +650,7 @@ mod tests {
650650

651651
// Should have text for "<1bad/>" and one valid element
652652
assert!(
653-
handler.texts.len() >= 1,
653+
!handler.texts.is_empty(),
654654
"Invalid markup should produce text"
655655
);
656656
assert_eq!(

native/rustyxml/src/dom/document.rs

Lines changed: 3 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -52,24 +52,11 @@ impl<'a> XmlDocument<'a> {
5252
doc
5353
}
5454

55-
/// Parse an XML document in strict mode
56-
pub fn parse_strict(input: &'a [u8]) -> Result<Self, String> {
57-
let mut doc = XmlDocument {
58-
input,
59-
nodes: Vec::with_capacity(256),
60-
attributes: Vec::with_capacity(128),
61-
strings: StringPool::new(),
62-
root_element: None,
63-
};
64-
65-
doc.nodes.push(XmlNode::document());
66-
doc.build_from_events(true)?;
67-
Ok(doc)
68-
}
69-
7055
/// Intern a Cow<[u8]> intelligently:
7156
/// - If Borrowed (points into input): use intern_ref (zero-copy)
7257
/// - If Owned (entity-decoded): use intern (copies to pool)
58+
// Pattern-matches Borrowed vs Owned to choose zero-copy interning (intern_ref) or copying (intern)
59+
#[expect(clippy::ptr_arg)]
7360
#[inline]
7461
fn intern_cow(&mut self, cow: &Cow<'_, [u8]>) -> u32 {
7562
match cow {
@@ -432,26 +419,6 @@ impl<'a> XmlDocument<'a> {
432419
self.strings.get_str_with_input(node.name_id, self.input)
433420
}
434421

435-
/// Get node local name (without prefix)
436-
pub fn node_local_name(&self, id: NodeId) -> Option<&str> {
437-
let name = self.node_name(id)?;
438-
if let Some(pos) = name.find(':') {
439-
Some(&name[pos + 1..])
440-
} else {
441-
Some(name)
442-
}
443-
}
444-
445-
/// Get text content of a text node
446-
pub fn text_content(&self, id: NodeId) -> Option<&str> {
447-
let node = self.get_node(id)?;
448-
if node.is_text() || node.kind == NodeKind::CData {
449-
self.strings.get_str_with_input(node.name_id, self.input)
450-
} else {
451-
None
452-
}
453-
}
454-
455422
/// Get attributes for an element
456423
pub fn attributes(&self, id: NodeId) -> &[XmlAttribute] {
457424
if let Some(node) = self.get_node(id) {
@@ -463,28 +430,6 @@ impl<'a> XmlDocument<'a> {
463430
}
464431
}
465432

466-
/// Get attribute value by name
467-
pub fn get_attribute(&self, node_id: NodeId, name: &str) -> Option<&str> {
468-
for attr in self.attributes(node_id) {
469-
if self.strings.get_str_with_input(attr.name_id, self.input) == Some(name) {
470-
return self.strings.get_str_with_input(attr.value_id, self.input);
471-
}
472-
}
473-
None
474-
}
475-
476-
/// Get all attribute names and values for a node
477-
pub fn get_attribute_values(&self, node_id: NodeId) -> Vec<(&str, &str)> {
478-
self.attributes(node_id)
479-
.iter()
480-
.filter_map(|attr| {
481-
let name = self.strings.get_str_with_input(attr.name_id, self.input)?;
482-
let value = self.strings.get_str_with_input(attr.value_id, self.input)?;
483-
Some((name, value))
484-
})
485-
.collect()
486-
}
487-
488433
/// Iterate over children of a node
489434
pub fn children(&self, id: NodeId) -> ChildIter<'_, 'a> {
490435
let first = self.get_node(id).and_then(|n| n.first_child);
@@ -629,6 +574,7 @@ impl<'a> DocumentAccess for XmlDocument<'a> {
629574
/// document structure, DTD) without allocating nodes, attributes, or a
630575
/// string pool. This is the memory-efficient validation path used by
631576
/// `parse_strict` before building the structural index.
577+
#[must_use = "validation result should be checked"]
632578
pub fn validate_strict(input: &[u8]) -> Result<(), String> {
633579
let mut reader = SliceReader::new_strict(input);
634580
let mut tag_stack: Vec<Vec<u8>> = vec![];

native/rustyxml/src/dom/mod.rs

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@ pub mod node;
1212
pub mod strings;
1313

1414
pub use document::validate_strict;
15-
#[cfg(test)]
16-
pub use node::XmlAttribute;
1715
pub use node::{NodeId, NodeKind, XmlNode};
1816

1917
#[cfg(test)]
@@ -80,3 +78,41 @@ pub trait DocumentAccess {
8078
0
8179
}
8280
}
81+
82+
/// Get the XPath string-value of a node per XPath 1.0 spec.
83+
///
84+
/// - For text/CDATA nodes: returns the text content
85+
/// - For elements: concatenation of all descendant text nodes
86+
/// - For other node types: empty string
87+
pub fn node_string_value<D: DocumentAccess>(doc: &D, node_id: NodeId) -> String {
88+
let kind = doc.node_kind_of(node_id);
89+
90+
match kind {
91+
NodeKind::Text | NodeKind::CData => doc.text_content(node_id).unwrap_or("").to_string(),
92+
NodeKind::Element => {
93+
let mut result = String::new();
94+
collect_descendant_text(doc, node_id, &mut result);
95+
result
96+
}
97+
_ => String::new(),
98+
}
99+
}
100+
101+
/// Recursively collect text content from all descendant text nodes.
102+
fn collect_descendant_text<D: DocumentAccess>(doc: &D, node_id: NodeId, result: &mut String) {
103+
for child_id in doc.children_vec(node_id) {
104+
let kind = doc.node_kind_of(child_id);
105+
106+
match kind {
107+
NodeKind::Text | NodeKind::CData => {
108+
if let Some(text) = doc.text_content(child_id) {
109+
result.push_str(text);
110+
}
111+
}
112+
NodeKind::Element => {
113+
collect_descendant_text(doc, child_id, result);
114+
}
115+
_ => {}
116+
}
117+
}
118+
}

native/rustyxml/src/index/builder.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,11 @@ impl<'a> IndexBuilder<'a> {
5656
// Build children from parent links
5757
self.index.build_children_from_parents();
5858

59+
// Release over-allocated capacity from initial estimates.
60+
// Estimates are based on input size heuristics and often over-allocate
61+
// by 2-3x. This reclaims significant memory for long-lived documents.
62+
self.index.shrink_to_fit();
63+
5964
// Debug output for structural index sizing (disabled by default)
6065
// Enable by setting RUSTYXML_DEBUG_INDEX=1 environment variable
6166
#[cfg(feature = "memory_tracking")]

native/rustyxml/src/lib.rs

Lines changed: 9 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ fn xpath_text_list<'a>(
234234
Ok(XPathValue::NodeSet(nodes)) => {
235235
let mut list = Term::list_new_empty(env);
236236
for &id in nodes.iter().rev() {
237-
let text = get_node_text_content(&view, id);
237+
let text = dom::node_string_value(&view, id);
238238
let binary = term::bytes_to_binary(env, text.as_bytes());
239239
list = list.list_prepend(binary);
240240
}
@@ -276,7 +276,7 @@ fn parse_and_xpath_text<'a>(
276276
Ok(XPathValue::NodeSet(nodes)) => {
277277
let mut list = Term::list_new_empty(env);
278278
for &id in nodes.iter().rev() {
279-
let text = get_node_text_content(&view, id);
279+
let text = dom::node_string_value(&view, id);
280280
let binary = term::bytes_to_binary(env, text.as_bytes());
281281
list = list.list_prepend(binary);
282282
}
@@ -365,7 +365,7 @@ fn xpath_string_value<'a>(env: Env<'a>, input: Binary<'a>, xpath_str: &str) -> N
365365
xpath::XPathValue::Boolean(b) => b.to_string(),
366366
xpath::XPathValue::NodeSet(nodes) => {
367367
if let Some(&node_id) = nodes.first() {
368-
get_node_text_content(&view, node_id)
368+
dom::node_string_value(&view, node_id)
369369
} else {
370370
String::new()
371371
}
@@ -394,7 +394,7 @@ fn xpath_string_value_doc<'a>(
394394
xpath::XPathValue::Boolean(b) => b.to_string(),
395395
xpath::XPathValue::NodeSet(nodes) => {
396396
if let Some(&node_id) = nodes.first() {
397-
get_node_text_content(&view, node_id)
397+
dom::node_string_value(&view, node_id)
398398
} else {
399399
String::new()
400400
}
@@ -407,47 +407,6 @@ fn xpath_string_value_doc<'a>(
407407
}
408408
}
409409

410-
/// Helper to get text content of a node
411-
fn get_node_text_content<D: dom::DocumentAccess>(doc: &D, node_id: dom::NodeId) -> String {
412-
use dom::NodeKind;
413-
414-
let kind = doc.node_kind_of(node_id);
415-
416-
match kind {
417-
NodeKind::Text | NodeKind::CData => doc.text_content(node_id).unwrap_or("").to_string(),
418-
NodeKind::Element => {
419-
let mut result = String::new();
420-
collect_text_content(doc, node_id, &mut result);
421-
result
422-
}
423-
_ => String::new(),
424-
}
425-
}
426-
427-
/// Recursively collect text content from descendants
428-
fn collect_text_content<D: dom::DocumentAccess>(
429-
doc: &D,
430-
node_id: dom::NodeId,
431-
result: &mut String,
432-
) {
433-
use dom::NodeKind;
434-
435-
for child_id in doc.children_vec(node_id) {
436-
let kind = doc.node_kind_of(child_id);
437-
438-
match kind {
439-
NodeKind::Text | NodeKind::CData => {
440-
if let Some(text) = doc.text_content(child_id) {
441-
result.push_str(text);
442-
}
443-
}
444-
NodeKind::Element => {
445-
collect_text_content(doc, child_id, result);
446-
}
447-
_ => {}
448-
}
449-
}
450-
}
451410

452411
// ============================================================================
453412
// Streaming Parser
@@ -1352,8 +1311,11 @@ fn encode_attrs(buf: &mut BinaryWriter, input: &[u8], span: (usize, usize)) {
13521311
/// Return an empty BEAM binary.
13531312
#[inline]
13541313
fn empty_binary<'a>(env: Env<'a>) -> Term<'a> {
1355-
let owned = rustler::OwnedBinary::new(0).unwrap();
1356-
owned.release(env).encode(env)
1314+
// OwnedBinary::new(0) should never fail, but avoid panicking in a NIF.
1315+
match rustler::OwnedBinary::new(0) {
1316+
Some(owned) => owned.release(env).encode(env),
1317+
None => "".encode(env),
1318+
}
13571319
}
13581320

13591321
// ============================================================================

native/rustyxml/src/resource.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -275,8 +275,11 @@ pub struct DocumentAccumulator {
275275

276276
impl DocumentAccumulator {
277277
pub fn new() -> Self {
278+
// Start with 4KB — enough for small documents, grows as needed.
279+
// Previously 64KB which wasted memory for small documents and
280+
// multiplied quickly across concurrent accumulators.
278281
Self {
279-
buffer: Mutex::new(Vec::with_capacity(64 * 1024)),
282+
buffer: Mutex::new(Vec::with_capacity(4096)),
280283
}
281284
}
282285

native/rustyxml/src/strategy/streaming.rs

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,13 @@ impl StreamingParser {
173173
// Remove processed bytes efficiently using drain (no reallocation needed,
174174
// just moves remaining bytes to front)
175175
self.buffer.drain(..boundary);
176+
177+
// Only shrink if excess capacity is significant (> 4x needed).
178+
// Avoids grow-shrink-grow churn in active streaming while still
179+
// reclaiming memory from large spikes in long-lived parsers.
180+
if self.buffer.capacity() > 4 * self.buffer.len().max(8192) {
181+
self.buffer.shrink_to(8192);
182+
}
176183
}
177184

178185
/// Process a slice of the buffer up to the given boundary
@@ -382,7 +389,12 @@ impl StreamingParser {
382389
std::mem::take(&mut self.events)
383390
} else {
384391
// Partial take - use drain
385-
self.events.drain(..count).collect()
392+
let taken: Vec<_> = self.events.drain(..count).collect();
393+
// Only shrink if excess capacity is significant (> 4x needed)
394+
if self.events.capacity() > 4 * self.events.len().max(64) {
395+
self.events.shrink_to(64);
396+
}
397+
taken
386398
}
387399
}
388400

@@ -393,7 +405,12 @@ impl StreamingParser {
393405
if count == self.complete_elements.len() {
394406
std::mem::take(&mut self.complete_elements)
395407
} else {
396-
self.complete_elements.drain(..count).collect()
408+
let taken: Vec<_> = self.complete_elements.drain(..count).collect();
409+
// Only shrink if excess capacity is significant (> 4x needed)
410+
if self.complete_elements.capacity() > 4 * self.complete_elements.len().max(16) {
411+
self.complete_elements.shrink_to(16);
412+
}
413+
taken
397414
}
398415
}
399416

native/rustyxml/src/xpath/compiler.rs

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,13 @@
66
use super::parser::{Axis, BinaryOp, Expr, NodeTest, Step};
77
use lru::LruCache;
88
use std::num::NonZeroUsize;
9-
use std::sync::Mutex;
9+
use std::sync::{Arc, Mutex};
1010

11-
/// Global LRU cache for compiled XPath expressions
12-
/// Using a Mutex for thread-safety across BEAM schedulers
13-
static XPATH_CACHE: Mutex<Option<LruCache<String, CompiledExpr>>> = Mutex::new(None);
11+
/// Global LRU cache for compiled XPath expressions.
12+
/// Using Arc<CompiledExpr> to avoid deep cloning on cache hits —
13+
/// each hit is now a cheap Arc pointer bump instead of cloning
14+
/// all Vec<Op>, Strings, and Box<CompiledExpr> recursively.
15+
static XPATH_CACHE: Mutex<Option<LruCache<String, Arc<CompiledExpr>>>> = Mutex::new(None);
1416

1517
/// Cache capacity - tuned for typical XPath usage patterns
1618
const CACHE_CAPACITY: usize = 256;
@@ -217,26 +219,29 @@ const CACHE_CAPACITY_NONZERO: NonZeroUsize = match NonZeroUsize::new(CACHE_CAPAC
217219
None => panic!("CACHE_CAPACITY must be non-zero"),
218220
};
219221

220-
/// Compile an XPath expression string (with caching)
221-
pub fn compile(xpath: &str) -> Result<CompiledExpr, String> {
222+
/// Compile an XPath expression string (with caching).
223+
///
224+
/// Returns `Arc<CompiledExpr>` — cache hits are a cheap pointer bump
225+
/// instead of a deep clone of all operations, strings, and predicates.
226+
pub fn compile(xpath: &str) -> Result<Arc<CompiledExpr>, String> {
222227
// Try to get from cache first
223228
if let Ok(mut guard) = XPATH_CACHE.lock() {
224229
let cache = guard.get_or_insert_with(|| LruCache::new(CACHE_CAPACITY_NONZERO));
225230

226231
if let Some(compiled) = cache.get(xpath) {
227-
return Ok(compiled.clone());
232+
return Ok(Arc::clone(compiled));
228233
}
229234
}
230235
// If mutex is poisoned, just skip the cache and compile directly
231236

232237
// Not in cache - parse and compile
233238
let expr = super::parser::parse(xpath)?;
234-
let compiled = CompiledExpr::compile(&expr);
239+
let compiled = Arc::new(CompiledExpr::compile(&expr));
235240

236241
// Store in cache (if mutex is available)
237242
if let Ok(mut guard) = XPATH_CACHE.lock() {
238243
let cache = guard.get_or_insert_with(|| LruCache::new(CACHE_CAPACITY_NONZERO));
239-
cache.put(xpath.to_string(), compiled.clone());
244+
cache.put(xpath.to_string(), Arc::clone(&compiled));
240245
}
241246

242247
Ok(compiled)

0 commit comments

Comments
 (0)