diff --git a/src/earley/mod.rs b/src/earley/mod.rs index 1d80378..d550a9c 100644 --- a/src/earley/mod.rs +++ b/src/earley/mod.rs @@ -2,36 +2,47 @@ mod input_range; mod traversal; use crate::parser::grammar::ParseGrammar; -use crate::{ParseTree, ParseTreeNode, Term, tracing}; +use crate::{GrammarParser, ParseTree, ParseTreeNode, Term, tracing}; use input_range::InputRange; use std::collections::{BTreeSet, HashSet, VecDeque}; use std::rc::Rc; use traversal::{TermMatch, Traversal, TraversalId, TraversalTree}; pub fn parse<'gram>( - grammar: &'gram crate::Grammar, + grammar: &'gram GrammarParser<'gram>, input: &'gram str, + starting_term: Option<&'gram Term>, ) -> impl Iterator> { - ParseTreeIter::new(grammar, input) + ParseTreeIter::new(ParserHold::Borrowed(grammar), input, starting_term) } -pub fn parse_starting_with<'gram>( - grammar: &'gram crate::Grammar, +/// Parse using an owned parser (e.g. from deprecated `Grammar::parse_input`). +/// The iterator holds `Rc` to keep the parser alive. +pub fn parse_with_parser_rc<'gram>( + parser: Rc>, input: &'gram str, - starting_term: &'gram Term, + starting_term: Option<&'gram Term>, ) -> impl Iterator> { - ParseTreeIter::new_starting_with(grammar, input, starting_term) + ParseTreeIter::new(ParserHold::Owned(parser), input, starting_term) } -/// Parse input using a pre-built `ParseGrammar`, starting with the given term. -/// This allows reusing the `ParseGrammar` for multiple inputs. -pub(crate) fn parse_starting_with_grammar<'gram>( - parse_grammar: &Rc>, - input: &'gram str, - starting_term: &'gram Term, -) -> impl Iterator> { - // Clone the Rc (just increments reference count, no data copying) - ParseTreeIter::new_starting_with_grammar(Rc::clone(parse_grammar), input, starting_term) +/// Holds either a borrowed or owned parser so the iterator can keep it alive when needed. +/// +/// Only required for the deprecated `Grammar::parse_input` and `Grammar::parse_input_starting_with` methods. +/// Prefer `GrammarParser::parse_input` and `GrammarParser::parse_input_starting_with` instead. +#[derive(Debug)] +enum ParserHold<'gram> { + Borrowed(&'gram GrammarParser<'gram>), + Owned(Rc>), +} + +impl<'gram> ParserHold<'gram> { + fn as_ref(&self) -> &GrammarParser<'gram> { + match self { + ParserHold::Borrowed(p) => p, + ParserHold::Owned(rc) => rc.as_ref(), + } + } } /// A queue of [`TraversalId`] for processing, with repetitions ignored. @@ -169,45 +180,35 @@ fn earley<'gram>( #[derive(Debug)] struct ParseTreeIter<'gram> { + parser: ParserHold<'gram>, traversal_tree: TraversalTree<'gram>, - grammar: Rc>, queue: TraversalQueue, completions: CompletionMap<'gram>, } impl<'gram> ParseTreeIter<'gram> { - pub fn new(grammar: &'gram crate::Grammar, input: &'gram str) -> Self { - let starting_term = grammar - .starting_term() - .expect("Grammar must have one production to parse"); - - Self::new_starting_with(grammar, input, starting_term) - } - - pub fn new_starting_with( - grammar: &'gram crate::Grammar, + pub fn new( + parser: ParserHold<'gram>, input: &'gram str, - starting_term: &'gram Term, + starting_term: Option<&'gram Term>, ) -> Self { - let parse_grammar = Rc::new(ParseGrammar::new(grammar)); - Self::new_starting_with_grammar(parse_grammar, input, starting_term) - } - - pub(crate) fn new_starting_with_grammar( - parse_grammar: Rc>, - input: &'gram str, - starting_term: &'gram Term, - ) -> Self { - let input = InputRange::new(input); + let input_range = InputRange::new(input); let mut traversal_tree = TraversalTree::default(); let mut queue = TraversalQueue::default(); let completions = CompletionMap::default(); - - queue.push_back_starting(&mut traversal_tree, &parse_grammar, starting_term, &input); + let parser_ref = parser.as_ref(); + let starting_term = starting_term.unwrap_or(parser_ref.starting_term); + + queue.push_back_starting( + &mut traversal_tree, + parser_ref.parse_grammar.as_ref(), + starting_term, + &input_range, + ); Self { traversal_tree, - grammar: parse_grammar, + parser, queue, completions, } @@ -220,13 +221,14 @@ impl<'gram> Iterator for ParseTreeIter<'gram> { let Self { queue, completions, - grammar, + parser, traversal_tree, } = self; + let parse_grammar = &parser.as_ref().parse_grammar; - earley(queue, traversal_tree, completions, grammar).map(|traversal_id| { + earley(queue, traversal_tree, completions, parse_grammar).map(|traversal_id| { let _span = tracing::span!(tracing::Level::DEBUG, "next_parse_tree").entered(); - let parse_tree = parse_tree(traversal_tree, grammar, traversal_id); + let parse_tree = parse_tree(traversal_tree, parse_grammar, traversal_id); tracing::event!(tracing::Level::TRACE, "\n{parse_tree}"); parse_tree }) @@ -338,8 +340,9 @@ mod tests { fn prop_empty_rules_allow_parse(grammar: NestedEmptyGrammar) -> TestResult { let input = "a"; + let parser = GrammarParser::new(&grammar.0).unwrap(); - let mut parses = parse(&grammar.0, input); + let mut parses = parse(&parser, input, None); TestResult::from_bool(parses.next().is_some()) } diff --git a/src/earley/traversal.rs b/src/earley/traversal.rs index b29f826..eb1e4aa 100644 --- a/src/earley/traversal.rs +++ b/src/earley/traversal.rs @@ -240,7 +240,7 @@ mod tests { grammar: &'a Grammar, input: &'static str, ) -> (ParseGrammar<'a>, InputRange<'static>, TraversalTree<'a>) { - let matching = ParseGrammar::new(grammar); + let matching = ParseGrammar::new(grammar).unwrap(); let input = InputRange::new(input); let tree = TraversalTree::default(); diff --git a/src/grammar.rs b/src/grammar.rs index b346b82..b691e92 100644 --- a/src/grammar.rs +++ b/src/grammar.rs @@ -45,6 +45,7 @@ use rand::{Rng, SeedableRng, rng, rngs::StdRng, seq::IndexedRandom}; use serde::{Deserialize, Serialize}; use std::fmt::{self, Write}; +use std::rc::Rc; use std::str; /// A node of a `ParseTree`, either terminating or continuing the `ParseTree` @@ -69,7 +70,7 @@ impl<'gram> ParseTree<'gram> { } // A set of column indices, used for tracking which columns are active when formatting a `ParseTree` -type ParseTreeFormatSet = std::collections::HashSet; +type ParseTreeFormatSet = crate::HashSet; impl<'gram> ParseTree<'gram> { fn fmt( @@ -344,11 +345,10 @@ impl Grammar { /// Remove `Production` from the `Grammar` pub fn remove_production(&mut self, prod: &Production) -> Option { - if let Some(pos) = self.productions.iter().position(|x| *x == *prod) { - Some(self.productions.remove(pos)) - } else { - None - } + self.productions + .iter() + .position(|x| *x == *prod) + .map(|pos| self.productions.swap_remove(pos)) } /// Get iterator of the `Grammar`'s `Production`s @@ -361,6 +361,40 @@ impl Grammar { self.productions.iter_mut() } + /// Validate the `Grammar` has no undefined nonterminals + /// + /// No need to call this method before building a parser, as the parser will validate the grammar at construction time. + /// + /// # Errors + /// + /// Returns `Error::ValidationError` if the grammar has no productions or has undefined nonterminals. + pub fn validate(&self) -> Result<(), Error> { + if self.productions.is_empty() { + return Err(Error::ValidationError( + "Grammar must have at least one production".to_string(), + )); + } + let mut sets = crate::validation::NonterminalSets::new(); + for production in self.productions_iter() { + if let Term::Nonterminal(nt) = &production.lhs { + sets.record_lhs(nt.as_str()); + } + for expression in production.rhs_iter() { + for term in expression.terms_iter() { + if let Term::Nonterminal(nt) = term { + sets.record_rhs(nt.as_str()); + } + } + } + } + if let Some(undefined) = sets.undefined().next() { + return Err(Error::ValidationError(format!( + "Undefined nonterminals: <{undefined}>" + ))); + } + Ok(()) + } + /// Build a reusable parser from this grammar, validating that all nonterminals are defined. /// /// This method validates the grammar and creates a [`crate::GrammarParser`] that can be @@ -412,7 +446,8 @@ impl Grammar { &'gram self, input: &'gram str, ) -> impl Iterator> { - crate::earley::parse(self, input) + let parser = Rc::new(crate::GrammarParser::new_unchecked(self)); + crate::earley::parse_with_parser_rc(parser, input, None) } /// Parse input strings according to `Grammar`, starting with given production @@ -440,7 +475,8 @@ impl Grammar { input: &'gram str, starting_term: &'gram Term, ) -> impl Iterator> { - crate::earley::parse_starting_with(self, input, starting_term) + let parser = Rc::new(crate::GrammarParser::new_unchecked(self)); + crate::earley::parse_with_parser_rc(parser, input, Some(starting_term)) } /// Get the starting term @@ -938,6 +974,48 @@ mod tests { ); } + #[test] + fn validate_fails_for_empty_grammar() { + let grammar = Grammar::from_parts(vec![]); + let result = grammar.validate(); + assert!(result.is_err(), "validate should fail for empty grammar"); + assert!(matches!(result.unwrap_err(), Error::ValidationError(_))); + } + + #[test] + fn validate_succeeds_for_valid_grammar() { + let grammar: Grammar = " ::= | + ::= 'a' + ::= 'b'" + .parse() + .unwrap(); + assert!(grammar.validate().is_ok()); + } + + #[test] + fn validate_fails_for_undefined_nonterminal() { + let grammar: Grammar = " ::= | + ::= 'a'" + .parse() + .unwrap(); + let result = grammar.validate(); + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), Error::ValidationError(_))); + } + + #[test] + fn validate_error_message_contains_undefined_nonterminal() { + let grammar: Grammar = " ::= ".parse().unwrap(); + let err = grammar.validate().unwrap_err(); + let Error::ValidationError(msg) = err else { + panic!("expected ValidationError"); + }; + assert!( + msg.contains(""), + "message should mention undefined nonterminal: {msg}" + ); + } + #[test] fn parse_error() { let grammar: Result = " ::= ::= 'c'" .parse() .unwrap(); - let valid: std::collections::HashSet = ["a", "b", "ac", "bc"] + let valid: crate::HashSet = ["a", "b", "ac", "bc"] .into_iter() .map(String::from) .collect(); diff --git a/src/lib.rs b/src/lib.rs index edbc74e..d0f3e74 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,6 +10,7 @@ mod parsers; mod production; mod term; mod tracing; +mod validation; pub use crate::error::Error; pub use crate::expression::Expression; pub use crate::grammar::{Grammar, ParseTree, ParseTreeNode, escape_mermaid_label}; @@ -25,3 +26,4 @@ pub use parsers::ABNF; pub use parsers::{BNF, Format}; pub(crate) use hashbrown::HashMap; +pub(crate) use hashbrown::HashSet; diff --git a/src/parser/grammar.rs b/src/parser/grammar.rs index 31e5d27..9e079de 100644 --- a/src/parser/grammar.rs +++ b/src/parser/grammar.rs @@ -1,4 +1,5 @@ use crate::append_vec::{AppendOnlyVec, append_only_vec_id}; +use crate::error::Error; use crate::tracing; append_only_vec_id!(pub(crate) ProductionId); @@ -23,26 +24,75 @@ pub(crate) struct ParseGrammar<'gram> { } impl<'gram, 'a> ParseGrammar<'gram> { - pub fn new(grammar: &'gram crate::Grammar) -> Self { + /// Build a `ParseGrammar` from a `Grammar`, validating that all nonterminals + /// referenced in productions have definitions. + /// + /// # Errors + /// + /// Returns `Error::ValidationError` if any nonterminal used in the RHS of + /// productions lacks a definition in the grammar. + pub fn new(grammar: &'gram crate::Grammar) -> Result { let _span = tracing::span!(tracing::Level::DEBUG, "ParseGrammar_new").entered(); let mut productions = AppendOnlyVec::::new(); let mut prods_by_lhs = ProdTermMap::new(); + let mut sets = crate::validation::NonterminalSets::new(); let flat_prod_iter = grammar .productions_iter() .flat_map(|prod| prod.rhs_iter().map(|rhs| (&prod.lhs, rhs))); for (lhs, rhs) in flat_prod_iter { + if let crate::Term::Nonterminal(nt) = lhs { + sets.record_lhs(nt.as_str()); + } + for term in rhs.terms_iter() { + if let crate::Term::Nonterminal(nt) = term { + sets.record_rhs(nt.as_str()); + } + } let prod = productions.push_with_id(|id| Production { id, lhs, rhs }); let id = prod.id; prods_by_lhs.entry(lhs).or_default().push(id); } + + if let Some(undefined) = sets.undefined().next() { + let message = format!("Undefined nonterminals: <{undefined}>"); + return Err(Error::ValidationError(message)); + } + + Ok(Self { + prods_by_lhs, + productions, + }) + } + + /// Build a `ParseGrammar` from a `Grammar` without validating that all + /// referenced nonterminals are defined. Used only by deprecated + /// `Grammar::parse_input` / `parse_input_starting_with` to preserve + /// pre-validation behavior. + pub(crate) fn new_unchecked(grammar: &'gram crate::Grammar) -> Self { + let _span = tracing::span!(tracing::Level::DEBUG, "ParseGrammar_new_unchecked").entered(); + + let mut productions = AppendOnlyVec::::new(); + let mut prods_by_lhs = ProdTermMap::new(); + + let flat_prod_iter = grammar + .productions_iter() + .flat_map(|prod| prod.rhs_iter().map(|rhs| (&prod.lhs, rhs))); + + for (lhs, rhs) in flat_prod_iter { + let prod = productions.push_with_id(|id| Production { id, lhs, rhs }); + let id = prod.id; + prods_by_lhs.entry(lhs).or_default().push(id); + } + Self { prods_by_lhs, productions, } } + pub fn get_production_by_id(&'a self, prod_id: ProductionId) -> &'a Production<'gram> { self.productions.get(prod_id).expect("valid production ID") } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 82c8e12..5db602c 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -5,7 +5,6 @@ use crate::error::Error; use crate::grammar::Grammar; use crate::term::Term; use grammar::ParseGrammar; -use std::collections::HashSet; use std::rc::Rc; /// A reusable parser built from a `Grammar` that validates all nonterminals are defined @@ -27,8 +26,8 @@ use std::rc::Rc; /// ``` #[derive(Debug)] pub struct GrammarParser<'gram> { - starting_term: &'gram Term, - parse_grammar: Rc>, + pub(crate) starting_term: &'gram Term, + pub(crate) parse_grammar: Rc>, } impl<'gram> GrammarParser<'gram> { @@ -40,83 +39,51 @@ impl<'gram> GrammarParser<'gram> { /// Returns `Error::ValidationError` if any nonterminal used in the RHS of /// productions lacks a definition in the grammar. pub fn new(grammar: &'gram Grammar) -> Result { - validate_nonterminals(grammar)?; let starting_term = grammar.starting_term().ok_or_else(|| { Error::ValidationError("Grammar must have at least one production".to_string()) })?; - let parse_grammar = Rc::new(ParseGrammar::new(grammar)); + let parse_grammar = Rc::new(ParseGrammar::new(grammar)?); Ok(Self { starting_term, parse_grammar, }) } + /// Construct a parser without validating that all nonterminals are defined. + /// Used only by deprecated `Grammar::parse_input` / `parse_input_starting_with`. + pub(crate) fn new_unchecked(grammar: &'gram Grammar) -> Self { + let starting_term = grammar + .starting_term() + .expect("Grammar must have at least one production"); + let parse_grammar = Rc::new(ParseGrammar::new_unchecked(grammar)); + Self { + starting_term, + parse_grammar, + } + } + /// Parse an input string using the grammar's starting nonterminal. /// /// Returns an iterator over all possible parse trees for the input. - pub fn parse_input(&self, input: &'gram str) -> impl Iterator> { + pub fn parse_input<'p: 'gram>( + &'p self, + input: &'gram str, + ) -> impl Iterator> + use<'p, 'gram> { self.parse_input_starting_with(input, self.starting_term) } /// Parse an input string starting with the given term (nonterminal or terminal). /// /// Returns an iterator over all possible parse trees for the input. - pub fn parse_input_starting_with( - &self, + pub fn parse_input_starting_with<'p: 'gram>( + &'p self, input: &'gram str, start: &'gram Term, - ) -> impl Iterator> { - crate::earley::parse_starting_with_grammar(&self.parse_grammar, input, start) + ) -> impl Iterator> + use<'p, 'gram> { + crate::earley::parse(self, input, Some(start)) } } -/// Validate that all nonterminals referenced in the grammar have definitions. -/// -/// # Errors -/// -/// Returns `Error::ValidationError` with a message listing all undefined nonterminals. -fn validate_nonterminals(grammar: &Grammar) -> Result<(), Error> { - // Collect all nonterminals defined in LHS of productions - let mut defined_nonterminals = HashSet::new(); - for production in grammar.productions_iter() { - if let Term::Nonterminal(ref nt) = production.lhs { - defined_nonterminals.insert(nt.clone()); - } - } - - // Collect all nonterminals used in RHS of all productions - let mut referenced_nonterminals = HashSet::new(); - for production in grammar.productions_iter() { - for expression in production.rhs_iter() { - for term in expression.terms_iter() { - if let Term::Nonterminal(nt) = term { - referenced_nonterminals.insert(nt.clone()); - } - } - } - } - - // Find undefined nonterminals - let undefined: Vec = referenced_nonterminals - .difference(&defined_nonterminals) - .cloned() - .collect(); - - if !undefined.is_empty() { - let message = format!( - "Undefined nonterminals: {}", - undefined - .iter() - .map(|nt| format!("<{nt}>")) - .collect::>() - .join(", ") - ); - return Err(Error::ValidationError(message)); - } - - Ok(()) -} - #[cfg(test)] mod tests { use super::*; @@ -412,42 +379,60 @@ mod tests { } } - // Helper: Generate grammar that may have undefined nonterminals + /// Generates a grammar that always has at least one undefined nonterminal + /// (referenced in a production RHS but never defined). + /// + /// Structure: `[nt0, nt1, ..., undefined0, undefined1, ...]` + /// - First `defined_count` nonterminals get productions + /// - Remaining "undefined" nonterminals are referenced but never defined + /// - We force at least one undefined reference so the grammar is invalid #[derive(Debug, Clone)] struct GrammarWithUndefined(Grammar); impl Arbitrary for GrammarWithUndefined { fn arbitrary(g: &mut Gen) -> Self { - let num_nonterms = usize::arbitrary(g) % 4 + 1; - let mut nonterms: Vec = (0..num_nonterms).map(|i| format!("nt{}", i)).collect(); - - // Add some undefined nonterminals - let num_undefined = usize::arbitrary(g) % 3; - for i in 0..num_undefined { - nonterms.push(format!("undefined{}", i)); - } + let defined_count = usize::arbitrary(g) % 4 + 1; + let num_undefined = usize::arbitrary(g) % 3 + 1; + + let defined_nonterms: Vec = + (0..defined_count).map(|i| format!("nt{}", i)).collect(); + let undefined_nonterms: Vec = (0..num_undefined) + .map(|i| format!("undefined{}", i)) + .collect(); + let all_nonterms: Vec = defined_nonterms + .iter() + .chain(undefined_nonterms.iter()) + .cloned() + .collect(); let mut productions = Vec::new(); - let defined_count = num_nonterms; - - for (idx, nt) in nonterms.iter().enumerate() { - if idx >= defined_count { - // Don't define the undefined nonterminals - continue; - } + let mut has_undefined_reference = false; + for (idx, nt) in defined_nonterms.iter().enumerate() { let mut expressions = Vec::new(); let num_alternatives = usize::arbitrary(g) % 2 + 1; - for _ in 0..num_alternatives { + for alt_idx in 0..num_alternatives { let mut terms = Vec::new(); let num_terms = usize::arbitrary(g) % 2 + 1; - for _ in 0..num_terms { - if bool::arbitrary(g) && !nonterms.is_empty() { - // Reference any nonterminal (may be undefined) - let ref_idx = usize::arbitrary(g) % nonterms.len(); - if let Some(nt) = nonterms.get(ref_idx) { - terms.push(Term::Nonterminal(nt.clone())); + // Invariant: first production's first alternative must reference undefined + let is_first_alt_of_first_prod = idx == 0 && alt_idx == 0; + let must_insert_undefined = + is_first_alt_of_first_prod && !has_undefined_reference; + + for term_idx in 0..num_terms { + let use_nonterminal = must_insert_undefined && term_idx == 0 + || (bool::arbitrary(g) && !all_nonterms.is_empty()); + + if use_nonterminal { + let ref_idx = if must_insert_undefined && term_idx == 0 { + has_undefined_reference = true; + defined_count + usize::arbitrary(g) % num_undefined + } else { + usize::arbitrary(g) % all_nonterms.len() + }; + if let Some(ref_nt) = all_nonterms.get(ref_idx) { + terms.push(Term::Nonterminal(ref_nt.clone())); } else { terms.push(Term::Terminal(String::arbitrary(g))); } @@ -472,42 +457,9 @@ mod tests { // Property test: Parser construction fails if any nonterminal lacks definition fn prop_parser_fails_with_undefined_nonterminal(grammar: GrammarWithUndefined) -> TestResult { let grammar = grammar.0; - - // Collect all nonterminals defined in LHS - let mut defined = std::collections::HashSet::new(); - for production in grammar.productions_iter() { - if let Term::Nonterminal(nt) = &production.lhs { - defined.insert(nt.clone()); - } - } - - // Collect all nonterminals used in RHS - let mut referenced = std::collections::HashSet::new(); - for production in grammar.productions_iter() { - for expression in production.rhs_iter() { - for term in expression.terms_iter() { - if let Term::Nonterminal(nt) = term { - referenced.insert(nt.clone()); - } - } - } - } - - // Find undefined nonterminals - let undefined: Vec<_> = referenced.difference(&defined).cloned().collect(); - - let parser_result = grammar.build_parser(); - - if undefined.is_empty() { - // All nonterminals are defined, parser should succeed - TestResult::from_bool(parser_result.is_ok()) - } else { - // Some nonterminals are undefined, parser should fail - TestResult::from_bool( - parser_result.is_err() - && matches!(parser_result.unwrap_err(), Error::ValidationError(_)), - ) - } + let parser = grammar.build_parser(); + let is_validation_error = matches!(parser, Err(Error::ValidationError(_))); + TestResult::from_bool(is_validation_error) } #[test] @@ -659,7 +611,7 @@ mod tests { let grammar = grammar.0; // Collect all nonterminals defined in LHS - let mut defined = std::collections::HashSet::new(); + let mut defined = crate::HashSet::new(); for production in grammar.productions_iter() { if let Term::Nonterminal(nt) = &production.lhs { defined.insert(nt.clone()); @@ -667,7 +619,7 @@ mod tests { } // Collect all nonterminals used in RHS - let mut referenced = std::collections::HashSet::new(); + let mut referenced = crate::HashSet::new(); for production in grammar.productions_iter() { for expression in production.rhs_iter() { for term in expression.terms_iter() { @@ -688,11 +640,11 @@ mod tests { TestResult::from_bool(undefined.is_empty()) } Err(Error::ValidationError(msg)) => { - // Parser failed, error message should mention all undefined nonterminals - let all_mentioned = undefined + // Parser failed, error message should mention at least one undefined nonterminal + let any_mentioned = undefined .iter() - .all(|nt| msg.contains(&format!("<{nt}>")) || msg.contains(nt)); - TestResult::from_bool(!undefined.is_empty() && all_mentioned) + .any(|nt| msg.contains(&format!("<{nt}>")) || msg.contains(nt)); + TestResult::from_bool(!undefined.is_empty() && any_mentioned) } Err(_) => TestResult::error("Expected ValidationError"), } diff --git a/src/parsers/mod.rs b/src/parsers/mod.rs index 48d8a2c..c677ac3 100644 --- a/src/parsers/mod.rs +++ b/src/parsers/mod.rs @@ -11,7 +11,6 @@ use crate::expression::Expression; use crate::grammar::Grammar; use crate::production::Production; use crate::term::Term; -use std::collections::HashSet; use nom::{ IResult, Parser, @@ -326,7 +325,7 @@ fn parsed_grammar_complete(input: &str) -> IResult<&str, ParsedGramma /// so they do not clash with any existing LHS nonterminal (e.g. user-defined `<__anon0>`). /// Optionals `[A / B]` are lowered to a fresh nonterminal with alternatives `A | B | ''`. fn normalize_parsed_grammar(parsed: ParsedGrammar) -> Grammar { - let mut used_names = HashSet::new(); + let mut used_names = crate::HashSet::new(); for prod in &parsed.productions { let ParsedProduction::Complex { lhs, .. } = prod; used_names.insert(lhs.clone()); @@ -336,7 +335,7 @@ fn normalize_parsed_grammar(parsed: ParsedGrammar) -> Grammar { let mut anon_prods = Vec::new(); /// Pick a fresh name that does not collide with user-defined LHS or other generated names. - fn fresh_anon_name(used: &mut HashSet, counter: &mut usize) -> String { + fn fresh_anon_name(used: &mut crate::HashSet, counter: &mut usize) -> String { loop { let candidate = format!("__anon{}", counter); *counter += 1; @@ -349,7 +348,7 @@ fn normalize_parsed_grammar(parsed: ParsedGrammar) -> Grammar { fn lower_expression( expr: ParsedExpression, - used: &mut HashSet, + used: &mut crate::HashSet, counter: &mut usize, anon_prods: &mut Vec, ) -> Expression { @@ -363,7 +362,7 @@ fn normalize_parsed_grammar(parsed: ParsedGrammar) -> Grammar { fn lower_term( term: ParsedTerm, - used: &mut HashSet, + used: &mut crate::HashSet, counter: &mut usize, anon_prods: &mut Vec, ) -> Term { diff --git a/src/validation.rs b/src/validation.rs new file mode 100644 index 0000000..4be6155 --- /dev/null +++ b/src/validation.rs @@ -0,0 +1,32 @@ +//! Shared logic for collecting LHS (defined) and RHS (referenced) nonterminals +//! and iterating over undefined ones. + +/// Records nonterminals that appear as LHS (defined) or RHS (referenced) in +/// productions; supports iterating over undefined nonterminals (referenced − defined). +#[derive(Debug, Default)] +pub(crate) struct NonterminalSets<'a> { + defined: crate::HashSet<&'a str>, + referenced: crate::HashSet<&'a str>, +} + +impl<'a> NonterminalSets<'a> { + pub(crate) fn new() -> Self { + Self { + defined: crate::HashSet::new(), + referenced: crate::HashSet::new(), + } + } + + pub(crate) fn record_lhs(&mut self, nt: &'a str) { + self.defined.insert(nt); + } + + pub(crate) fn record_rhs(&mut self, nt: &'a str) { + self.referenced.insert(nt); + } + + /// Iterator over nonterminals that are referenced but not defined. + pub(crate) fn undefined(&self) -> impl Iterator + '_ { + self.referenced.difference(&self.defined).copied() + } +} diff --git a/tests/parse_input.rs b/tests/parse_input.rs index c440b2a..940e83e 100644 --- a/tests/parse_input.rs +++ b/tests/parse_input.rs @@ -10,6 +10,7 @@ use std::sync::LazyLock; #[test] fn undefined_prod() { + // Grammar references but only defines ; validation should fail let grammar: Grammar = " ::= | ::= 'a' @@ -17,8 +18,25 @@ fn undefined_prod() { .parse() .unwrap(); - let input = "a"; + let parser_result = grammar.build_parser(); + assert!( + parser_result.is_err(), + "Parser should fail when grammar has undefined nonterminals" + ); +} +#[test] +fn undefined_prod_deprecated_parses() { + // Deprecated parse_input skips validation: grammar has undefined but still + // parses via the defined branch. + let grammar: Grammar = " + ::= | + ::= 'a' + " + .parse() + .unwrap(); + + let input = "a"; let parses: Vec<_> = grammar.parse_input(input).map(|a| a.to_string()).collect(); assert_snapshot!(parses.join("\n")); } diff --git a/tests/snapshots/parse_input__undefined_prod.snap b/tests/snapshots/parse_input__undefined_prod_deprecated_parses.snap similarity index 72% rename from tests/snapshots/parse_input__undefined_prod.snap rename to tests/snapshots/parse_input__undefined_prod_deprecated_parses.snap index a37805b..5a3501e 100644 --- a/tests/snapshots/parse_input__undefined_prod.snap +++ b/tests/snapshots/parse_input__undefined_prod_deprecated_parses.snap @@ -1,8 +1,7 @@ --- source: tests/parse_input.rs -expression: parses.next().unwrap() +expression: "parses.join(\"\\n\")" --- ::= └── ::= "a" └── "a" -