Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions native/rustyxml/src/core/unified_scanner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -630,7 +630,7 @@ mod tests {

// The '<' should be emitted as text, then "1invalid/>" as more text
assert!(
handler.texts.len() >= 1,
!handler.texts.is_empty(),
"Invalid markup should produce text events"
);
assert_eq!(
Expand All @@ -650,7 +650,7 @@ mod tests {

// Should have text for "<1bad/>" and one valid element
assert!(
handler.texts.len() >= 1,
!handler.texts.is_empty(),
"Invalid markup should produce text"
);
assert_eq!(
Expand Down
60 changes: 3 additions & 57 deletions native/rustyxml/src/dom/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,24 +52,11 @@ impl<'a> XmlDocument<'a> {
doc
}

/// Parse an XML document in strict mode
pub fn parse_strict(input: &'a [u8]) -> Result<Self, String> {
let mut doc = XmlDocument {
input,
nodes: Vec::with_capacity(256),
attributes: Vec::with_capacity(128),
strings: StringPool::new(),
root_element: None,
};

doc.nodes.push(XmlNode::document());
doc.build_from_events(true)?;
Ok(doc)
}

/// Intern a Cow<[u8]> intelligently:
/// - If Borrowed (points into input): use intern_ref (zero-copy)
/// - If Owned (entity-decoded): use intern (copies to pool)
// Pattern-matches Borrowed vs Owned to choose zero-copy interning (intern_ref) or copying (intern)
#[expect(clippy::ptr_arg)]
#[inline]
fn intern_cow(&mut self, cow: &Cow<'_, [u8]>) -> u32 {
match cow {
Expand Down Expand Up @@ -432,26 +419,6 @@ impl<'a> XmlDocument<'a> {
self.strings.get_str_with_input(node.name_id, self.input)
}

/// Get node local name (without prefix)
pub fn node_local_name(&self, id: NodeId) -> Option<&str> {
let name = self.node_name(id)?;
if let Some(pos) = name.find(':') {
Some(&name[pos + 1..])
} else {
Some(name)
}
}

/// Get text content of a text node
pub fn text_content(&self, id: NodeId) -> Option<&str> {
let node = self.get_node(id)?;
if node.is_text() || node.kind == NodeKind::CData {
self.strings.get_str_with_input(node.name_id, self.input)
} else {
None
}
}

/// Get attributes for an element
pub fn attributes(&self, id: NodeId) -> &[XmlAttribute] {
if let Some(node) = self.get_node(id) {
Expand All @@ -463,28 +430,6 @@ impl<'a> XmlDocument<'a> {
}
}

/// Get attribute value by name
pub fn get_attribute(&self, node_id: NodeId, name: &str) -> Option<&str> {
for attr in self.attributes(node_id) {
if self.strings.get_str_with_input(attr.name_id, self.input) == Some(name) {
return self.strings.get_str_with_input(attr.value_id, self.input);
}
}
None
}

/// Get all attribute names and values for a node
pub fn get_attribute_values(&self, node_id: NodeId) -> Vec<(&str, &str)> {
self.attributes(node_id)
.iter()
.filter_map(|attr| {
let name = self.strings.get_str_with_input(attr.name_id, self.input)?;
let value = self.strings.get_str_with_input(attr.value_id, self.input)?;
Some((name, value))
})
.collect()
}

/// Iterate over children of a node
pub fn children(&self, id: NodeId) -> ChildIter<'_, 'a> {
let first = self.get_node(id).and_then(|n| n.first_child);
Expand Down Expand Up @@ -629,6 +574,7 @@ impl<'a> DocumentAccess for XmlDocument<'a> {
/// document structure, DTD) without allocating nodes, attributes, or a
/// string pool. This is the memory-efficient validation path used by
/// `parse_strict` before building the structural index.
#[must_use = "validation result should be checked"]
pub fn validate_strict(input: &[u8]) -> Result<(), String> {
let mut reader = SliceReader::new_strict(input);
let mut tag_stack: Vec<Vec<u8>> = vec![];
Expand Down
40 changes: 38 additions & 2 deletions native/rustyxml/src/dom/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@ pub mod node;
pub mod strings;

pub use document::validate_strict;
#[cfg(test)]
pub use node::XmlAttribute;
pub use node::{NodeId, NodeKind, XmlNode};

#[cfg(test)]
Expand Down Expand Up @@ -80,3 +78,41 @@ pub trait DocumentAccess {
0
}
}

/// Get the XPath string-value of a node per XPath 1.0 spec.
///
/// - For text/CDATA nodes: returns the text content
/// - For elements: concatenation of all descendant text nodes
/// - For other node types: empty string
pub fn node_string_value<D: DocumentAccess>(doc: &D, node_id: NodeId) -> String {
    match doc.node_kind_of(node_id) {
        // Text-like nodes: the string-value is the node's own content.
        NodeKind::Text | NodeKind::CData => doc
            .text_content(node_id)
            .map_or_else(String::new, str::to_string),
        // Elements: concatenate every descendant text node, in document order.
        NodeKind::Element => {
            let mut buf = String::new();
            collect_descendant_text(doc, node_id, &mut buf);
            buf
        }
        // Comments, PIs, the document node, etc. contribute nothing here.
        _ => String::new(),
    }
}

/// Recursively collect text content from all descendant text nodes.
///
/// Appends each Text/CDATA descendant of `node_id` to `result` in
/// document order, descending through element children.
fn collect_descendant_text<D: DocumentAccess>(doc: &D, node_id: NodeId, result: &mut String) {
    for child in doc.children_vec(node_id) {
        match doc.node_kind_of(child) {
            NodeKind::Text | NodeKind::CData => {
                // Missing content (e.g. unresolvable string id) is simply skipped.
                if let Some(text) = doc.text_content(child) {
                    result.push_str(text);
                }
            }
            // Recurse into elements; their text descendants count too.
            NodeKind::Element => collect_descendant_text(doc, child, result),
            // Other node kinds (comments, PIs) carry no string-value.
            _ => {}
        }
    }
}
5 changes: 5 additions & 0 deletions native/rustyxml/src/index/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,11 @@ impl<'a> IndexBuilder<'a> {
// Build children from parent links
self.index.build_children_from_parents();

// Release over-allocated capacity from initial estimates.
// Estimates are based on input size heuristics and often over-allocate
// by 2-3x. This reclaims significant memory for long-lived documents.
self.index.shrink_to_fit();

// Debug output for structural index sizing (disabled by default)
// Enable by setting RUSTYXML_DEBUG_INDEX=1 environment variable
#[cfg(feature = "memory_tracking")]
Expand Down
56 changes: 9 additions & 47 deletions native/rustyxml/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ fn xpath_text_list<'a>(
Ok(XPathValue::NodeSet(nodes)) => {
let mut list = Term::list_new_empty(env);
for &id in nodes.iter().rev() {
let text = get_node_text_content(&view, id);
let text = dom::node_string_value(&view, id);
let binary = term::bytes_to_binary(env, text.as_bytes());
list = list.list_prepend(binary);
}
Expand Down Expand Up @@ -276,7 +276,7 @@ fn parse_and_xpath_text<'a>(
Ok(XPathValue::NodeSet(nodes)) => {
let mut list = Term::list_new_empty(env);
for &id in nodes.iter().rev() {
let text = get_node_text_content(&view, id);
let text = dom::node_string_value(&view, id);
let binary = term::bytes_to_binary(env, text.as_bytes());
list = list.list_prepend(binary);
}
Expand Down Expand Up @@ -365,7 +365,7 @@ fn xpath_string_value<'a>(env: Env<'a>, input: Binary<'a>, xpath_str: &str) -> N
xpath::XPathValue::Boolean(b) => b.to_string(),
xpath::XPathValue::NodeSet(nodes) => {
if let Some(&node_id) = nodes.first() {
get_node_text_content(&view, node_id)
dom::node_string_value(&view, node_id)
} else {
String::new()
}
Expand Down Expand Up @@ -394,7 +394,7 @@ fn xpath_string_value_doc<'a>(
xpath::XPathValue::Boolean(b) => b.to_string(),
xpath::XPathValue::NodeSet(nodes) => {
if let Some(&node_id) = nodes.first() {
get_node_text_content(&view, node_id)
dom::node_string_value(&view, node_id)
} else {
String::new()
}
Expand All @@ -407,47 +407,6 @@ fn xpath_string_value_doc<'a>(
}
}

/// Helper to get text content of a node
fn get_node_text_content<D: dom::DocumentAccess>(doc: &D, node_id: dom::NodeId) -> String {
use dom::NodeKind;

let kind = doc.node_kind_of(node_id);

match kind {
NodeKind::Text | NodeKind::CData => doc.text_content(node_id).unwrap_or("").to_string(),
NodeKind::Element => {
let mut result = String::new();
collect_text_content(doc, node_id, &mut result);
result
}
_ => String::new(),
}
}

/// Recursively collect text content from descendants
fn collect_text_content<D: dom::DocumentAccess>(
doc: &D,
node_id: dom::NodeId,
result: &mut String,
) {
use dom::NodeKind;

for child_id in doc.children_vec(node_id) {
let kind = doc.node_kind_of(child_id);

match kind {
NodeKind::Text | NodeKind::CData => {
if let Some(text) = doc.text_content(child_id) {
result.push_str(text);
}
}
NodeKind::Element => {
collect_text_content(doc, child_id, result);
}
_ => {}
}
}
}

// ============================================================================
// Streaming Parser
Expand Down Expand Up @@ -1352,8 +1311,11 @@ fn encode_attrs(buf: &mut BinaryWriter, input: &[u8], span: (usize, usize)) {
/// Return an empty BEAM binary.
#[inline]
fn empty_binary<'a>(env: Env<'a>) -> Term<'a> {
    // A zero-length OwnedBinary allocation should always succeed, but a NIF
    // must never panic: if the allocator ever refuses, fall back to encoding
    // an empty string term instead.
    rustler::OwnedBinary::new(0)
        .map(|owned| owned.release(env).encode(env))
        .unwrap_or_else(|| "".encode(env))
}

// ============================================================================
Expand Down
5 changes: 4 additions & 1 deletion native/rustyxml/src/resource.rs
Original file line number Diff line number Diff line change
Expand Up @@ -275,8 +275,11 @@ pub struct DocumentAccumulator {

impl DocumentAccumulator {
pub fn new() -> Self {
// Start with 4KB — enough for small documents, grows as needed.
// Previously 64KB which wasted memory for small documents and
// multiplied quickly across concurrent accumulators.
Self {
buffer: Mutex::new(Vec::with_capacity(64 * 1024)),
buffer: Mutex::new(Vec::with_capacity(4096)),
}
}

Expand Down
21 changes: 19 additions & 2 deletions native/rustyxml/src/strategy/streaming.rs
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,13 @@ impl StreamingParser {
// Remove processed bytes efficiently using drain (no reallocation needed,
// just moves remaining bytes to front)
self.buffer.drain(..boundary);

// Only shrink if excess capacity is significant (> 4x needed).
// Avoids grow-shrink-grow churn in active streaming while still
// reclaiming memory from large spikes in long-lived parsers.
if self.buffer.capacity() > 4 * self.buffer.len().max(8192) {
self.buffer.shrink_to(8192);
}
}

/// Process a slice of the buffer up to the given boundary
Expand Down Expand Up @@ -382,7 +389,12 @@ impl StreamingParser {
std::mem::take(&mut self.events)
} else {
// Partial take - use drain
self.events.drain(..count).collect()
let taken: Vec<_> = self.events.drain(..count).collect();
// Only shrink if excess capacity is significant (> 4x needed)
if self.events.capacity() > 4 * self.events.len().max(64) {
self.events.shrink_to(64);
}
taken
}
}

Expand All @@ -393,7 +405,12 @@ impl StreamingParser {
if count == self.complete_elements.len() {
std::mem::take(&mut self.complete_elements)
} else {
self.complete_elements.drain(..count).collect()
let taken: Vec<_> = self.complete_elements.drain(..count).collect();
// Only shrink if excess capacity is significant (> 4x needed)
if self.complete_elements.capacity() > 4 * self.complete_elements.len().max(16) {
self.complete_elements.shrink_to(16);
}
taken
}
}

Expand Down
23 changes: 14 additions & 9 deletions native/rustyxml/src/xpath/compiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@
use super::parser::{Axis, BinaryOp, Expr, NodeTest, Step};
use lru::LruCache;
use std::num::NonZeroUsize;
use std::sync::Mutex;
use std::sync::{Arc, Mutex};

/// Global LRU cache for compiled XPath expressions
/// Using a Mutex for thread-safety across BEAM schedulers
static XPATH_CACHE: Mutex<Option<LruCache<String, CompiledExpr>>> = Mutex::new(None);
/// Global LRU cache for compiled XPath expressions.
/// Using Arc<CompiledExpr> to avoid deep cloning on cache hits —
/// each hit is now a cheap Arc pointer bump instead of cloning
/// all Vec<Op>, Strings, and Box<CompiledExpr> recursively.
static XPATH_CACHE: Mutex<Option<LruCache<String, Arc<CompiledExpr>>>> = Mutex::new(None);

/// Cache capacity - tuned for typical XPath usage patterns
const CACHE_CAPACITY: usize = 256;
Expand Down Expand Up @@ -217,26 +219,29 @@ const CACHE_CAPACITY_NONZERO: NonZeroUsize = match NonZeroUsize::new(CACHE_CAPAC
None => panic!("CACHE_CAPACITY must be non-zero"),
};

/// Compile an XPath expression string (with caching)
pub fn compile(xpath: &str) -> Result<CompiledExpr, String> {
/// Compile an XPath expression string (with caching).
///
/// Returns `Arc<CompiledExpr>` — cache hits are a cheap pointer bump
/// instead of a deep clone of all operations, strings, and predicates.
pub fn compile(xpath: &str) -> Result<Arc<CompiledExpr>, String> {
// Try to get from cache first
if let Ok(mut guard) = XPATH_CACHE.lock() {
let cache = guard.get_or_insert_with(|| LruCache::new(CACHE_CAPACITY_NONZERO));

if let Some(compiled) = cache.get(xpath) {
return Ok(compiled.clone());
return Ok(Arc::clone(compiled));
}
}
// If mutex is poisoned, just skip the cache and compile directly

// Not in cache - parse and compile
let expr = super::parser::parse(xpath)?;
let compiled = CompiledExpr::compile(&expr);
let compiled = Arc::new(CompiledExpr::compile(&expr));

// Store in cache (if mutex is available)
if let Ok(mut guard) = XPATH_CACHE.lock() {
let cache = guard.get_or_insert_with(|| LruCache::new(CACHE_CAPACITY_NONZERO));
cache.put(xpath.to_string(), compiled.clone());
cache.put(xpath.to_string(), Arc::clone(&compiled));
}

Ok(compiled)
Expand Down
Loading