From bed51f7c81feca9fd6b69d6d8dedc0d104e284b6 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Mon, 25 Nov 2024 01:42:29 -0700 Subject: [PATCH 01/51] feat(#230): map basic typst expressions to tokens --- Cargo.lock | 190 +++++++++++++++++++++++++++++++ harper-core/Cargo.toml | 1 + harper-core/src/parsers/mod.rs | 1 + harper-core/src/parsers/typst.rs | 170 +++++++++++++++++++++++++++ 4 files changed, 362 insertions(+) create mode 100644 harper-core/src/parsers/typst.rs diff --git a/Cargo.lock b/Cargo.lock index 1a2a5767..c263f4b9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -322,6 +322,31 @@ dependencies = [ "itertools 0.10.5", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" + [[package]] name = "crunchy" version = "0.2.2" @@ -393,6 +418,15 @@ dependencies = [ "syn", ] +[[package]] +name = "ecow" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e42fc0a93992b20c58b99e59d61eaf1635a25bfbe49e4275c34ba0aee98119ba" +dependencies = [ + "serde", +] + [[package]] name = "either" version = "1.13.0" @@ -610,6 +644,7 @@ dependencies = [ "serde_json", "smallvec", "thiserror 2.0.4", + "typst-syntax", "unicode-blocks", "unicode-width 0.2.0", ] @@ -850,6 +885,16 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "indexmap" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +dependencies = [ + "equivalent", + "hashbrown 0.15.1", +] + [[package]] name = "is-docker" version = "0.2.0" @@ -1161,6 +1206,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "portable-atomic" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" + [[package]] name = "ppv-lite86" version = "0.2.20" @@ -1239,6 +1290,26 @@ dependencies = [ "serde", ] +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "redox_syscall" version = "0.5.7" @@ -1378,6 +1449,15 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_spanned" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87607cb1398ed59d48732e575a4c28a7a8ebf2454b964fe3f224f2afc07909e1" +dependencies = [ + "serde", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -1387,6 +1467,12 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "siphasher" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" + [[package]] name = "slab" version = "0.4.9" @@ -1449,6 +1535,12 @@ dependencies = [ "syn", ] +[[package]] +name = "thin-vec" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a38c90d48152c236a3ab59271da4f4ae63d678c5d7ad6b7714d7cb9760be5e4b" + [[package]] name = "thiserror" version = "1.0.69" @@ -1559,6 +1651,40 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml" +version = "0.8.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1ed1f98e3fdc28d6d910e6737ae6ab1a93bf1985935a1193e68f93eeb68d24e" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", + "winnow", +] + [[package]] name = "tower" version = "0.4.13" @@ -1857,6 +1983,37 @@ dependencies = [ "tree-sitter", ] +[[package]] +name = "typst-syntax" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05b7be8b6ed6b2cb39ca495947d548a28d7db0ba244008e44c5a759120327693" +dependencies = [ + "ecow", + "once_cell", + "serde", + "toml", + "typst-utils", + "unicode-ident", + "unicode-math-class", + "unicode-script", + "unicode-segmentation", + "unscanny", +] + +[[package]] +name = "typst-utils" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f0305443ed97f0b658471487228f86bf835705e7525fbdcc671cebd864f7a40" +dependencies = [ + "once_cell", + "portable-atomic", + "rayon", + "siphasher", + "thin-vec", +] + [[package]] name = "unicase" version = "2.8.0" @@ -1875,6 +2032,24 @@ version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" +[[package]] +name = "unicode-math-class" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d246cf599d5fae3c8d56e04b20eb519adb89a8af8d0b0fbcded369aa3647d65" + +[[package]] +name = "unicode-script" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fb421b350c9aff471779e262955939f565ec18b86c15364e6bdf0d662ca7c1f" + +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + [[package]] name = "unicode-width" version = "0.1.14" @@ -1887,6 +2062,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" +[[package]] +name = "unscanny" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9df2af067a7953e9c3831320f35c1cc0600c30d44d9f7a12b01db1cd88d6b47" + [[package]] name = "url" version = "2.5.3" @@ -2173,6 +2354,15 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "winnow" +version = "0.6.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b" +dependencies = [ + "memchr", +] + [[package]] name = "write16" version = "1.0.0" diff --git a/harper-core/Cargo.toml b/harper-core/Cargo.toml index 1d5bdb0c..69f41a4c 100644 --- a/harper-core/Cargo.toml +++ b/harper-core/Cargo.toml @@ -24,6 +24,7 @@ thiserror = "2.0.4" unicode-blocks = "0.1.9" unicode-width = "0.2.0" levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } +typst-syntax = "0.12.0" [dev-dependencies] criterion = { version = "0.5.1", default-features = false } diff --git a/harper-core/src/parsers/mod.rs b/harper-core/src/parsers/mod.rs index f35f209b..08f53bbf 100644 --- a/harper-core/src/parsers/mod.rs +++ b/harper-core/src/parsers/mod.rs @@ -3,6 +3,7 @@ mod isolate_english; mod markdown; mod mask; mod plain_english; +mod typst; use blanket::blanket; pub use collapse_identifiers::CollapseIdentifiers; diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs new file mode 100644 index 00000000..9f5c6f85 --- /dev/null +++ b/harper-core/src/parsers/typst.rs @@ -0,0 +1,170 @@ +use itertools::Itertools; + +use typst_syntax::ast::{AstNode, Expr}; + +use super::{Parser, PlainEnglish}; +use crate::{parsers::StrParser, Token, TokenKind, WordMetadata}; + +/// A parser that wraps the [`PlainEnglish`] parser that allows one to parse +/// Typst files. +pub struct Typst; + +macro_rules! constant_token { + ($offset:ident, $doc:ident, $a:ident, $to:expr) => {{ + let range = $doc.range($a.span()).unwrap(); + *$offset += range.len(); + Some(vec![Token { + span: range.into(), + kind: $to, + }]) + }}; +} +macro_rules! recursive_env { + ($offset:ident, $expr:ident, $doc:ident, $parser:ident) => { + Some( + $expr + .body() + .exprs() + .filter_map(|e| map_token(e, $doc, $parser, $offset)) + .flatten() + .collect_vec(), + ) + }; +} + +fn map_token( + ex: typst_syntax::ast::Expr, + doc: &typst_syntax::Source, + parser: &mut PlainEnglish, + offset: &mut usize, +) -> Option> { + match ex { + Expr::Text(text) => Some( + parser + .parse_str(text.get()) + .into_iter() + .map(|mut t| { + t.span.push_by(*offset); + t + }) + .collect_vec(), + ), + Expr::Space(a) => constant_token!(offset, doc, a, TokenKind::Space(1)), + Expr::Linebreak(a) => constant_token!(offset, doc, a, TokenKind::Newline(1)), + Expr::Parbreak(a) => constant_token!(offset, doc, a, TokenKind::Newline(2)), + Expr::Escape(_) => None, + Expr::Shorthand(_) => None, + Expr::SmartQuote(_) => None, + Expr::Strong(strong) => recursive_env!(offset, strong, doc, parser), + Expr::Emph(emph) => recursive_env!(offset, emph, doc, parser), + Expr::Raw(_) => None, + Expr::Link(a) => constant_token!(offset, doc, a, TokenKind::Url), + Expr::Label(label) => Some( + parser + .parse_str(label.get()) + .into_iter() + .map(|mut t| { + t.span.push_by(*offset); + t + }) + .collect_vec(), + ), + Expr::Ref(a) => { + constant_token!(offset, doc, a, TokenKind::Word(WordMetadata::default())) + } + Expr::Heading(heading) => recursive_env!(offset, heading, doc, parser), + Expr::List(list_item) => recursive_env!(offset, list_item, doc, parser), + Expr::Enum(enum_item) => recursive_env!(offset, enum_item, doc, parser), + Expr::Term(term_item) => Some( + term_item + .term() + .exprs() + .chain(term_item.description().exprs()) + .filter_map(|e| map_token(e, doc, parser, offset)) + .flatten() + .collect_vec(), + ), + Expr::Equation(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), + Expr::Math(_) => None, + Expr::MathIdent(_) => None, + Expr::MathShorthand(_) => None, + Expr::MathAlignPoint(_) => None, + Expr::MathDelimited(_) => None, + Expr::MathAttach(_) => None, + Expr::MathPrimes(_) => None, + Expr::MathFrac(_) => None, + Expr::MathRoot(_) => None, + Expr::Ident(a) => constant_token!(offset, doc, a, TokenKind::Word(WordMetadata::default())), + Expr::None(a) => constant_token!(offset, doc, a, TokenKind::Word(WordMetadata::default())), + Expr::Auto(a) => constant_token!(offset, doc, a, TokenKind::Word(WordMetadata::default())), + Expr::Bool(a) => constant_token!(offset, doc, a, TokenKind::Word(WordMetadata::default())), + Expr::Int(int) => todo!(), + Expr::Float(float) => todo!(), + Expr::Numeric(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), + Expr::Str(text) => Some( + parser + .parse_str(text.get()) + .into_iter() + .map(|mut t| { + t.span.push_by(*offset); + t + }) + .collect_vec(), + ), + Expr::Code(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), + Expr::Content(content_block) => recursive_env!(offset, content_block, doc, parser), + Expr::Parenthesized(parenthesized) => map_token(parenthesized.expr(), doc, parser, offset), + Expr::Array(array) => Some( + array + .items() + .filter_map(|i| { + if let typst_syntax::ast::ArrayItem::Pos(e) = i { + map_token(e, doc, parser, offset) + } else { + None + } + }) + .flatten() + .collect_vec(), + ), + Expr::Dict(dict) => todo!(), + Expr::Unary(unary) => todo!(), + Expr::Binary(binary) => todo!(), + Expr::FieldAccess(field_access) => todo!(), + Expr::FuncCall(func_call) => todo!(), + Expr::Closure(closure) => todo!(), + Expr::Let(let_binding) => todo!(), + Expr::DestructAssign(destruct_assignment) => todo!(), + Expr::Set(set_rule) => todo!(), + Expr::Show(show_rule) => todo!(), + Expr::Contextual(contextual) => todo!(), + Expr::Conditional(conditional) => todo!(), + Expr::While(while_loop) => todo!(), + Expr::For(for_loop) => todo!(), + Expr::Import(module_import) => todo!(), + Expr::Include(module_include) => todo!(), + Expr::Break(loop_break) => todo!(), + Expr::Continue(loop_continue) => todo!(), + Expr::Return(func_return) => todo!(), + } +} + +impl Parser for Typst { + fn parse(&mut self, source: &[char]) -> Vec { + let mut english_parser = PlainEnglish; + + let source_str: String = source.iter().collect(); + let typst_document = typst_syntax::Source::detached(source_str); + let typst_tree = typst_syntax::ast::Markup::from_untyped(typst_document.root()) + .expect("Unable to create typst document from parsed tree!"); + let mut offset = 0; + + // NOTE: the range spits out __byte__ indices, not char indices. + // This is why we keep track above. + typst_tree + .exprs() + .filter_map(|ex| map_token(ex, &typst_document, &mut english_parser, &mut offset)) + .flatten() + .collect_vec() + } +} From 7201575d5f2080f8bcb576f07f8345bce55b8800 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Mon, 25 Nov 2024 11:29:57 -0700 Subject: [PATCH 02/51] feat(#230): change recursive shorthand from macro to function --- harper-core/src/parsers/typst.rs | 54 +++++++++++++++++--------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index 9f5c6f85..4f7e27b9 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -1,6 +1,6 @@ use itertools::Itertools; -use typst_syntax::ast::{AstNode, Expr}; +use typst_syntax::ast::{AstNode, Expr, Markup}; use super::{Parser, PlainEnglish}; use crate::{parsers::StrParser, Token, TokenKind, WordMetadata}; @@ -19,17 +19,19 @@ macro_rules! constant_token { }]) }}; } -macro_rules! recursive_env { - ($offset:ident, $expr:ident, $doc:ident, $parser:ident) => { - Some( - $expr - .body() - .exprs() - .filter_map(|e| map_token(e, $doc, $parser, $offset)) - .flatten() - .collect_vec(), - ) - }; + +fn recursive_env( + exprs: &mut dyn Iterator, + doc: &typst_syntax::Source, + parser: &mut PlainEnglish, + offset: &mut usize, +) -> Option> { + Some( + exprs + .filter_map(|e| map_token(e, doc, parser, offset)) + .flatten() + .collect_vec(), + ) } fn map_token( @@ -55,8 +57,8 @@ fn map_token( Expr::Escape(_) => None, Expr::Shorthand(_) => None, Expr::SmartQuote(_) => None, - Expr::Strong(strong) => recursive_env!(offset, strong, doc, parser), - Expr::Emph(emph) => recursive_env!(offset, emph, doc, parser), + Expr::Strong(strong) => recursive_env(&mut strong.body().exprs(), doc, parser, offset), + Expr::Emph(emph) => recursive_env(&mut emph.body().exprs(), doc, parser, offset), Expr::Raw(_) => None, Expr::Link(a) => constant_token!(offset, doc, a, TokenKind::Url), Expr::Label(label) => Some( @@ -72,17 +74,17 @@ fn map_token( Expr::Ref(a) => { constant_token!(offset, doc, a, TokenKind::Word(WordMetadata::default())) } - Expr::Heading(heading) => recursive_env!(offset, heading, doc, parser), - Expr::List(list_item) => recursive_env!(offset, list_item, doc, parser), - Expr::Enum(enum_item) => recursive_env!(offset, enum_item, doc, parser), - Expr::Term(term_item) => Some( - term_item + Expr::Heading(heading) => recursive_env(&mut heading.body().exprs(), doc, parser, offset), + Expr::List(list_item) => recursive_env(&mut list_item.body().exprs(), doc, parser, offset), + Expr::Enum(enum_item) => recursive_env(&mut enum_item.body().exprs(), doc, parser, offset), + Expr::Term(term_item) => recursive_env( + &mut term_item .term() .exprs() - .chain(term_item.description().exprs()) - .filter_map(|e| map_token(e, doc, parser, offset)) - .flatten() - .collect_vec(), + .chain(term_item.description().exprs()), + doc, + parser, + offset, ), Expr::Equation(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), Expr::Math(_) => None, @@ -112,7 +114,9 @@ fn map_token( .collect_vec(), ), Expr::Code(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), - Expr::Content(content_block) => recursive_env!(offset, content_block, doc, parser), + Expr::Content(content_block) => { + recursive_env(&mut content_block.body().exprs(), doc, parser, offset) + } Expr::Parenthesized(parenthesized) => map_token(parenthesized.expr(), doc, parser, offset), Expr::Array(array) => Some( array @@ -155,7 +159,7 @@ impl Parser for Typst { let source_str: String = source.iter().collect(); let typst_document = typst_syntax::Source::detached(source_str); - let typst_tree = typst_syntax::ast::Markup::from_untyped(typst_document.root()) + let typst_tree = Markup::from_untyped(typst_document.root()) .expect("Unable to create typst document from parsed tree!"); let mut offset = 0; From d880613dc56e5670063db3dfa1e47b68836c1643 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Mon, 25 Nov 2024 23:12:02 -0700 Subject: [PATCH 03/51] feat(#230): flesh out more complicated typst syntax parsing --- harper-core/src/parsers/typst.rs | 260 +++++++++++++++++++++++-------- 1 file changed, 195 insertions(+), 65 deletions(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index 4f7e27b9..6c15494e 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -3,14 +3,14 @@ use itertools::Itertools; use typst_syntax::ast::{AstNode, Expr, Markup}; use super::{Parser, PlainEnglish}; -use crate::{parsers::StrParser, Token, TokenKind, WordMetadata}; +use crate::{parsers::StrParser, Punctuation, Token, TokenKind, WordMetadata}; /// A parser that wraps the [`PlainEnglish`] parser that allows one to parse /// Typst files. pub struct Typst; macro_rules! constant_token { - ($offset:ident, $doc:ident, $a:ident, $to:expr) => {{ + ($offset:ident, $doc:ident, $a:expr, $to:expr) => {{ let range = $doc.range($a.span()).unwrap(); *$offset += range.len(); Some(vec![Token { @@ -20,6 +20,18 @@ macro_rules! constant_token { }}; } +macro_rules! merge_expr { + ($($inner:expr),*) => { + Some( + [$($inner),*] + .into_iter() + .flatten() + .flatten() + .collect_vec(), + ) + }; +} + fn recursive_env( exprs: &mut dyn Iterator, doc: &typst_syntax::Source, @@ -28,12 +40,33 @@ fn recursive_env( ) -> Option> { Some( exprs - .filter_map(|e| map_token(e, doc, parser, offset)) + .filter_map(|e| { + let range = doc.range(e.span()).unwrap(); + *offset += range.len(); + map_token(e, doc, parser, offset) + }) .flatten() .collect_vec(), ) } +fn parse_english( + str: impl Into, + parser: &mut PlainEnglish, + offset: &mut usize, +) -> Option> { + let res = parser + .parse_str(str.into()) + .into_iter() + .map(|mut t| { + t.span.push_by(*offset); + t + }) + .collect_vec(); + *offset = res.last()?.span.end - 1; + Some(res) +} + fn map_token( ex: typst_syntax::ast::Expr, doc: &typst_syntax::Source, @@ -41,36 +74,34 @@ fn map_token( offset: &mut usize, ) -> Option> { match ex { - Expr::Text(text) => Some( - parser - .parse_str(text.get()) - .into_iter() - .map(|mut t| { - t.span.push_by(*offset); - t - }) - .collect_vec(), - ), + Expr::Text(text) => parse_english(text.get(), parser, offset), Expr::Space(a) => constant_token!(offset, doc, a, TokenKind::Space(1)), Expr::Linebreak(a) => constant_token!(offset, doc, a, TokenKind::Newline(1)), Expr::Parbreak(a) => constant_token!(offset, doc, a, TokenKind::Newline(2)), - Expr::Escape(_) => None, - Expr::Shorthand(_) => None, - Expr::SmartQuote(_) => None, + Expr::Escape(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), + Expr::Shorthand(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), + Expr::SmartQuote(quote) => { + if quote.double() { + constant_token!( + offset, + doc, + quote, + TokenKind::Punctuation(Punctuation::Quote(crate::Quote { twin_loc: None })) + ) + } else { + constant_token!( + offset, + doc, + quote, + TokenKind::Punctuation(Punctuation::Apostrophe) + ) + } + } Expr::Strong(strong) => recursive_env(&mut strong.body().exprs(), doc, parser, offset), Expr::Emph(emph) => recursive_env(&mut emph.body().exprs(), doc, parser, offset), - Expr::Raw(_) => None, + Expr::Raw(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), Expr::Link(a) => constant_token!(offset, doc, a, TokenKind::Url), - Expr::Label(label) => Some( - parser - .parse_str(label.get()) - .into_iter() - .map(|mut t| { - t.span.push_by(*offset); - t - }) - .collect_vec(), - ), + Expr::Label(label) => parse_english(label.get(), parser, offset), Expr::Ref(a) => { constant_token!(offset, doc, a, TokenKind::Word(WordMetadata::default())) } @@ -87,15 +118,15 @@ fn map_token( offset, ), Expr::Equation(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), - Expr::Math(_) => None, - Expr::MathIdent(_) => None, - Expr::MathShorthand(_) => None, - Expr::MathAlignPoint(_) => None, - Expr::MathDelimited(_) => None, - Expr::MathAttach(_) => None, - Expr::MathPrimes(_) => None, - Expr::MathFrac(_) => None, - Expr::MathRoot(_) => None, + Expr::Math(_) => panic!("Unexpected math outside equation environment."), + Expr::MathIdent(_) => panic!("Unexpected math outside equation environment."), + Expr::MathShorthand(_) => panic!("Unexpected math outside equation environment."), + Expr::MathAlignPoint(_) => panic!("Unexpected math outside equation environment."), + Expr::MathDelimited(_) => panic!("Unexpected math outside equation environment."), + Expr::MathAttach(_) => panic!("Unexpected math outside equation environment."), + Expr::MathPrimes(_) => panic!("Unexpected math outside equation environment."), + Expr::MathFrac(_) => panic!("Unexpected math outside equation environment."), + Expr::MathRoot(_) => panic!("Unexpected math outside equation environment."), Expr::Ident(a) => constant_token!(offset, doc, a, TokenKind::Word(WordMetadata::default())), Expr::None(a) => constant_token!(offset, doc, a, TokenKind::Word(WordMetadata::default())), Expr::Auto(a) => constant_token!(offset, doc, a, TokenKind::Word(WordMetadata::default())), @@ -103,16 +134,7 @@ fn map_token( Expr::Int(int) => todo!(), Expr::Float(float) => todo!(), Expr::Numeric(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), - Expr::Str(text) => Some( - parser - .parse_str(text.get()) - .into_iter() - .map(|mut t| { - t.span.push_by(*offset); - t - }) - .collect_vec(), - ), + Expr::Str(text) => parse_english(text.get(), parser, offset), Expr::Code(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), Expr::Content(content_block) => { recursive_env(&mut content_block.body().exprs(), doc, parser, offset) @@ -131,25 +153,64 @@ fn map_token( .flatten() .collect_vec(), ), - Expr::Dict(dict) => todo!(), - Expr::Unary(unary) => todo!(), - Expr::Binary(binary) => todo!(), - Expr::FieldAccess(field_access) => todo!(), - Expr::FuncCall(func_call) => todo!(), - Expr::Closure(closure) => todo!(), - Expr::Let(let_binding) => todo!(), - Expr::DestructAssign(destruct_assignment) => todo!(), - Expr::Set(set_rule) => todo!(), - Expr::Show(show_rule) => todo!(), - Expr::Contextual(contextual) => todo!(), - Expr::Conditional(conditional) => todo!(), - Expr::While(while_loop) => todo!(), - Expr::For(for_loop) => todo!(), - Expr::Import(module_import) => todo!(), - Expr::Include(module_include) => todo!(), - Expr::Break(loop_break) => todo!(), - Expr::Continue(loop_continue) => todo!(), - Expr::Return(func_return) => todo!(), + // TODO: actually parse dictionaries + Expr::Dict(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), + Expr::Unary(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), + Expr::Binary(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), + Expr::FieldAccess(field_access) => merge_expr!( + map_token(field_access.target(), doc, parser, offset), + constant_token!( + offset, + doc, + field_access.field(), + TokenKind::Word(WordMetadata::default()) + ) + ), + Expr::FuncCall(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), + Expr::Closure(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), + Expr::Let(let_binding) => let_binding + .init() + .and_then(|e| map_token(e, doc, parser, offset)), + Expr::DestructAssign(destruct_assignment) => { + map_token(destruct_assignment.value(), doc, parser, offset) + } + Expr::Set(set_rule) => merge_expr!( + map_token(set_rule.target(), doc, parser, offset), + map_token(set_rule.condition()?, doc, parser, offset) + ), + Expr::Show(show_rule) => merge_expr!( + map_token(show_rule.transform(), doc, parser, offset), + map_token(show_rule.selector()?, doc, parser, offset) + ), + Expr::Contextual(contextual) => map_token(contextual.body(), doc, parser, offset), + Expr::Conditional(conditional) => merge_expr!( + map_token(conditional.condition(), doc, parser, offset), + map_token(conditional.if_body(), doc, parser, offset), + map_token(conditional.else_body()?, doc, parser, offset) + ), + Expr::While(while_loop) => merge_expr!( + map_token(while_loop.condition(), doc, parser, offset), + map_token(while_loop.body(), doc, parser, offset) + ), + Expr::For(for_loop) => merge_expr!( + map_token(for_loop.iterable(), doc, parser, offset), + map_token(for_loop.body(), doc, parser, offset) + ), + Expr::Import(module_import) => { + merge_expr!( + map_token(module_import.source(), doc, parser, offset), + constant_token!( + offset, + doc, + module_import.new_name()?, + TokenKind::Word(WordMetadata::default()) + ) + ) + } + Expr::Include(module_include) => map_token(module_include.source(), doc, parser, offset), + Expr::Break(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), + Expr::Continue(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), + Expr::Return(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), } } @@ -172,3 +233,72 @@ impl Parser for Typst { .collect_vec() } } + +#[cfg(test)] +mod tests { + use super::Typst; + use crate::{parsers::StrParser, Punctuation, TokenKind}; + + #[test] + fn conjunction() { + let source = r"doesn't"; + + let tokens = Typst.parse_str(source); + + let token_kinds = tokens.iter().map(|t| t.kind).collect::>(); + + dbg!(&token_kinds); + + assert!(matches!(token_kinds.as_slice(), &[TokenKind::Word(_),])) + } + + #[test] + fn sentence() { + let source = r"This is a sentence, it does not have any particularly interesting elements of the typst syntax."; + + let tokens = Typst.parse_str(source); + + let token_kinds = tokens.iter().map(|t| t.kind).collect::>(); + + dbg!(&token_kinds); + + assert!(matches!( + token_kinds.as_slice(), + &[ + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Punctuation(Punctuation::Comma), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Punctuation(Punctuation::Period), + ] + )) + } +} From 9376e71e8abf8d869a6071817c62cb9538435ed1 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Mon, 25 Nov 2024 23:25:17 -0700 Subject: [PATCH 04/51] feat(#230): delegate typst files to parser in harper-cli and harper-ls --- harper-cli/src/main.rs | 4 +++- harper-core/src/parsers/mod.rs | 1 + harper-ls/src/backend.rs | 6 +++++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/harper-cli/src/main.rs b/harper-cli/src/main.rs index 73b7d45d..1a3e3639 100644 --- a/harper-cli/src/main.rs +++ b/harper-cli/src/main.rs @@ -7,7 +7,7 @@ use ariadne::{Color, Label, Report, ReportKind, Source}; use clap::Parser; use harper_comments::CommentParser; use harper_core::linting::{LintGroup, LintGroupConfig, Linter}; -use harper_core::parsers::Markdown; +use harper_core::parsers::{Markdown, Typst}; use harper_core::{remove_overlaps, Dictionary, Document, FstDictionary}; #[derive(Debug, Parser)] @@ -107,6 +107,8 @@ fn load_file(file: &Path) -> anyhow::Result<(Document, String)> { let mut parser: Box = if let Some("md") = file.extension().map(|v| v.to_str().unwrap()) { Box::new(Markdown) + } else if let Some("typ") = file.extension().map(|v| v.to_str().unwrap()) { + Box::new(Typst) } else { Box::new( CommentParser::new_from_filename(file) diff --git a/harper-core/src/parsers/mod.rs b/harper-core/src/parsers/mod.rs index 08f53bbf..af742a3e 100644 --- a/harper-core/src/parsers/mod.rs +++ b/harper-core/src/parsers/mod.rs @@ -11,6 +11,7 @@ pub use isolate_english::IsolateEnglish; pub use markdown::Markdown; pub use mask::Mask; pub use plain_english::PlainEnglish; +pub use typst::Typst; pub use crate::token::{Token, TokenKind, TokenStringExt}; diff --git a/harper-ls/src/backend.rs b/harper-ls/src/backend.rs index 73ceedeb..7357d4f2 100644 --- a/harper-ls/src/backend.rs +++ b/harper-ls/src/backend.rs @@ -5,7 +5,9 @@ use std::sync::Arc; use anyhow::anyhow; use harper_comments::CommentParser; use harper_core::linting::{LintGroup, Linter}; -use harper_core::parsers::{CollapseIdentifiers, IsolateEnglish, Markdown, Parser, PlainEnglish}; +use harper_core::parsers::{ + CollapseIdentifiers, IsolateEnglish, Markdown, Parser, PlainEnglish, Typst, +}; use harper_core::{ Dictionary, Document, FstDictionary, FullDictionary, MergedDictionary, Token, TokenKind, WordMetadata, @@ -206,6 +208,8 @@ impl Backend { } } else if language_id == "markdown" { Some(Box::new(Markdown)) + } else if language_id == "typst" { + Some(Box::new(Typst)) } else if language_id == "git-commit" { Some(Box::new(GitCommitParser)) } else if language_id == "html" { From 749e6dd47366edfa14d04104a6a88e66d975a136 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Mon, 25 Nov 2024 23:32:28 -0700 Subject: [PATCH 05/51] fix(#230): fix offset update after delegating parser --- harper-core/src/parsers/typst.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index 6c15494e..3a894112 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -63,7 +63,7 @@ fn parse_english( t }) .collect_vec(); - *offset = res.last()?.span.end - 1; + *offset = res.last()?.span.end; Some(res) } From ba3c3070f62ced7b6173a608e6f46ff22b1c20ae Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Mon, 25 Nov 2024 23:58:30 -0700 Subject: [PATCH 06/51] fix(#230): ParBreak to ParBreak, not two Newlines --- harper-core/src/parsers/typst.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index 3a894112..8d9b5e02 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -77,7 +77,7 @@ fn map_token( Expr::Text(text) => parse_english(text.get(), parser, offset), Expr::Space(a) => constant_token!(offset, doc, a, TokenKind::Space(1)), Expr::Linebreak(a) => constant_token!(offset, doc, a, TokenKind::Newline(1)), - Expr::Parbreak(a) => constant_token!(offset, doc, a, TokenKind::Newline(2)), + Expr::Parbreak(a) => constant_token!(offset, doc, a, TokenKind::ParagraphBreak), Expr::Escape(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), Expr::Shorthand(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), Expr::SmartQuote(quote) => { From c6a4d05ed83dab35876006f0353b9e0c0fddf1e7 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Tue, 26 Nov 2024 16:58:26 -0700 Subject: [PATCH 07/51] feat(#230): remove offset variable, and just use the start of an environment's span --- harper-core/src/parsers/typst.rs | 138 +++++++++++++------------------ 1 file changed, 59 insertions(+), 79 deletions(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index 8d9b5e02..8edfe427 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -10,11 +10,9 @@ use crate::{parsers::StrParser, Punctuation, Token, TokenKind, WordMetadata}; pub struct Typst; macro_rules! constant_token { - ($offset:ident, $doc:ident, $a:expr, $to:expr) => {{ - let range = $doc.range($a.span()).unwrap(); - *$offset += range.len(); + ($doc:ident, $a:expr, $to:expr) => {{ Some(vec![Token { - span: range.into(), + span: $doc.range($a.span()).unwrap().into(), kind: $to, }]) }}; @@ -36,15 +34,10 @@ fn recursive_env( exprs: &mut dyn Iterator, doc: &typst_syntax::Source, parser: &mut PlainEnglish, - offset: &mut usize, ) -> Option> { Some( exprs - .filter_map(|e| { - let range = doc.range(e.span()).unwrap(); - *offset += range.len(); - map_token(e, doc, parser, offset) - }) + .filter_map(|e| map_token(e, doc, parser)) .flatten() .collect_vec(), ) @@ -52,18 +45,18 @@ fn recursive_env( fn parse_english( str: impl Into, + doc: &typst_syntax::Source, parser: &mut PlainEnglish, - offset: &mut usize, + span: &typst_syntax::Span, ) -> Option> { let res = parser .parse_str(str.into()) .into_iter() .map(|mut t| { - t.span.push_by(*offset); + t.span.push_by(doc.range(*span).unwrap().start); t }) .collect_vec(); - *offset = res.last()?.span.end; Some(res) } @@ -71,43 +64,36 @@ fn map_token( ex: typst_syntax::ast::Expr, doc: &typst_syntax::Source, parser: &mut PlainEnglish, - offset: &mut usize, ) -> Option> { match ex { - Expr::Text(text) => parse_english(text.get(), parser, offset), - Expr::Space(a) => constant_token!(offset, doc, a, TokenKind::Space(1)), - Expr::Linebreak(a) => constant_token!(offset, doc, a, TokenKind::Newline(1)), - Expr::Parbreak(a) => constant_token!(offset, doc, a, TokenKind::ParagraphBreak), - Expr::Escape(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), - Expr::Shorthand(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), + Expr::Text(text) => parse_english(text.get(), doc, parser, &text.span()), + Expr::Space(a) => constant_token!(doc, a, TokenKind::Space(1)), + Expr::Linebreak(a) => constant_token!(doc, a, TokenKind::Newline(1)), + Expr::Parbreak(a) => constant_token!(doc, a, TokenKind::ParagraphBreak), + Expr::Escape(a) => constant_token!(doc, a, TokenKind::Unlintable), + Expr::Shorthand(a) => constant_token!(doc, a, TokenKind::Unlintable), Expr::SmartQuote(quote) => { if quote.double() { constant_token!( - offset, doc, quote, TokenKind::Punctuation(Punctuation::Quote(crate::Quote { twin_loc: None })) ) } else { - constant_token!( - offset, - doc, - quote, - TokenKind::Punctuation(Punctuation::Apostrophe) - ) + constant_token!(doc, quote, TokenKind::Punctuation(Punctuation::Apostrophe)) } } - Expr::Strong(strong) => recursive_env(&mut strong.body().exprs(), doc, parser, offset), - Expr::Emph(emph) => recursive_env(&mut emph.body().exprs(), doc, parser, offset), - Expr::Raw(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), - Expr::Link(a) => constant_token!(offset, doc, a, TokenKind::Url), - Expr::Label(label) => parse_english(label.get(), parser, offset), + Expr::Strong(strong) => recursive_env(&mut strong.body().exprs(), doc, parser), + Expr::Emph(emph) => recursive_env(&mut emph.body().exprs(), doc, parser), + Expr::Raw(a) => constant_token!(doc, a, TokenKind::Unlintable), + Expr::Link(a) => constant_token!(doc, a, TokenKind::Url), + Expr::Label(label) => parse_english(label.get(), doc, parser, &label.span()), Expr::Ref(a) => { - constant_token!(offset, doc, a, TokenKind::Word(WordMetadata::default())) + constant_token!(doc, a, TokenKind::Word(WordMetadata::default())) } - Expr::Heading(heading) => recursive_env(&mut heading.body().exprs(), doc, parser, offset), - Expr::List(list_item) => recursive_env(&mut list_item.body().exprs(), doc, parser, offset), - Expr::Enum(enum_item) => recursive_env(&mut enum_item.body().exprs(), doc, parser, offset), + Expr::Heading(heading) => recursive_env(&mut heading.body().exprs(), doc, parser), + Expr::List(list_item) => recursive_env(&mut list_item.body().exprs(), doc, parser), + Expr::Enum(enum_item) => recursive_env(&mut enum_item.body().exprs(), doc, parser), Expr::Term(term_item) => recursive_env( &mut term_item .term() @@ -115,9 +101,8 @@ fn map_token( .chain(term_item.description().exprs()), doc, parser, - offset, ), - Expr::Equation(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), + Expr::Equation(a) => constant_token!(doc, a, TokenKind::Unlintable), Expr::Math(_) => panic!("Unexpected math outside equation environment."), Expr::MathIdent(_) => panic!("Unexpected math outside equation environment."), Expr::MathShorthand(_) => panic!("Unexpected math outside equation environment."), @@ -127,25 +112,25 @@ fn map_token( Expr::MathPrimes(_) => panic!("Unexpected math outside equation environment."), Expr::MathFrac(_) => panic!("Unexpected math outside equation environment."), Expr::MathRoot(_) => panic!("Unexpected math outside equation environment."), - Expr::Ident(a) => constant_token!(offset, doc, a, TokenKind::Word(WordMetadata::default())), - Expr::None(a) => constant_token!(offset, doc, a, TokenKind::Word(WordMetadata::default())), - Expr::Auto(a) => constant_token!(offset, doc, a, TokenKind::Word(WordMetadata::default())), - Expr::Bool(a) => constant_token!(offset, doc, a, TokenKind::Word(WordMetadata::default())), + Expr::Ident(a) => constant_token!(doc, a, TokenKind::Word(WordMetadata::default())), + Expr::None(a) => constant_token!(doc, a, TokenKind::Word(WordMetadata::default())), + Expr::Auto(a) => constant_token!(doc, a, TokenKind::Word(WordMetadata::default())), + Expr::Bool(a) => constant_token!(doc, a, TokenKind::Word(WordMetadata::default())), Expr::Int(int) => todo!(), Expr::Float(float) => todo!(), - Expr::Numeric(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), - Expr::Str(text) => parse_english(text.get(), parser, offset), - Expr::Code(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), + Expr::Numeric(a) => constant_token!(doc, a, TokenKind::Unlintable), + Expr::Str(text) => parse_english(text.get(), doc, parser, &text.span()), + Expr::Code(a) => constant_token!(doc, a, TokenKind::Unlintable), Expr::Content(content_block) => { - recursive_env(&mut content_block.body().exprs(), doc, parser, offset) + recursive_env(&mut content_block.body().exprs(), doc, parser) } - Expr::Parenthesized(parenthesized) => map_token(parenthesized.expr(), doc, parser, offset), + Expr::Parenthesized(parenthesized) => map_token(parenthesized.expr(), doc, parser), Expr::Array(array) => Some( array .items() .filter_map(|i| { if let typst_syntax::ast::ArrayItem::Pos(e) = i { - map_token(e, doc, parser, offset) + map_token(e, doc, parser) } else { None } @@ -154,63 +139,59 @@ fn map_token( .collect_vec(), ), // TODO: actually parse dictionaries - Expr::Dict(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), - Expr::Unary(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), - Expr::Binary(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), + Expr::Dict(a) => constant_token!(doc, a, TokenKind::Unlintable), + Expr::Unary(a) => constant_token!(doc, a, TokenKind::Unlintable), + Expr::Binary(a) => constant_token!(doc, a, TokenKind::Unlintable), Expr::FieldAccess(field_access) => merge_expr!( - map_token(field_access.target(), doc, parser, offset), + map_token(field_access.target(), doc, parser), constant_token!( - offset, doc, field_access.field(), TokenKind::Word(WordMetadata::default()) ) ), - Expr::FuncCall(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), - Expr::Closure(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), - Expr::Let(let_binding) => let_binding - .init() - .and_then(|e| map_token(e, doc, parser, offset)), + Expr::FuncCall(a) => constant_token!(doc, a, TokenKind::Unlintable), + Expr::Closure(a) => constant_token!(doc, a, TokenKind::Unlintable), + Expr::Let(let_binding) => let_binding.init().and_then(|e| map_token(e, doc, parser)), Expr::DestructAssign(destruct_assignment) => { - map_token(destruct_assignment.value(), doc, parser, offset) + map_token(destruct_assignment.value(), doc, parser) } Expr::Set(set_rule) => merge_expr!( - map_token(set_rule.target(), doc, parser, offset), - map_token(set_rule.condition()?, doc, parser, offset) + map_token(set_rule.target(), doc, parser), + map_token(set_rule.condition()?, doc, parser) ), Expr::Show(show_rule) => merge_expr!( - map_token(show_rule.transform(), doc, parser, offset), - map_token(show_rule.selector()?, doc, parser, offset) + map_token(show_rule.transform(), doc, parser), + map_token(show_rule.selector()?, doc, parser) ), - Expr::Contextual(contextual) => map_token(contextual.body(), doc, parser, offset), + Expr::Contextual(contextual) => map_token(contextual.body(), doc, parser), Expr::Conditional(conditional) => merge_expr!( - map_token(conditional.condition(), doc, parser, offset), - map_token(conditional.if_body(), doc, parser, offset), - map_token(conditional.else_body()?, doc, parser, offset) + map_token(conditional.condition(), doc, parser), + map_token(conditional.if_body(), doc, parser), + map_token(conditional.else_body()?, doc, parser) ), Expr::While(while_loop) => merge_expr!( - map_token(while_loop.condition(), doc, parser, offset), - map_token(while_loop.body(), doc, parser, offset) + map_token(while_loop.condition(), doc, parser), + map_token(while_loop.body(), doc, parser) ), Expr::For(for_loop) => merge_expr!( - map_token(for_loop.iterable(), doc, parser, offset), - map_token(for_loop.body(), doc, parser, offset) + map_token(for_loop.iterable(), doc, parser), + map_token(for_loop.body(), doc, parser) ), Expr::Import(module_import) => { merge_expr!( - map_token(module_import.source(), doc, parser, offset), + map_token(module_import.source(), doc, parser), constant_token!( - offset, doc, module_import.new_name()?, TokenKind::Word(WordMetadata::default()) ) ) } - Expr::Include(module_include) => map_token(module_include.source(), doc, parser, offset), - Expr::Break(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), - Expr::Continue(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), - Expr::Return(a) => constant_token!(offset, doc, a, TokenKind::Unlintable), + Expr::Include(module_include) => map_token(module_include.source(), doc, parser), + Expr::Break(a) => constant_token!(doc, a, TokenKind::Unlintable), + Expr::Continue(a) => constant_token!(doc, a, TokenKind::Unlintable), + Expr::Return(a) => constant_token!(doc, a, TokenKind::Unlintable), } } @@ -222,13 +203,12 @@ impl Parser for Typst { let typst_document = typst_syntax::Source::detached(source_str); let typst_tree = Markup::from_untyped(typst_document.root()) .expect("Unable to create typst document from parsed tree!"); - let mut offset = 0; // NOTE: the range spits out __byte__ indices, not char indices. // This is why we keep track above. typst_tree .exprs() - .filter_map(|ex| map_token(ex, &typst_document, &mut english_parser, &mut offset)) + .filter_map(|ex| map_token(ex, &typst_document, &mut english_parser)) .flatten() .collect_vec() } From 042511096bb624611fe53e45132ae10debaa3f56 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Wed, 27 Nov 2024 11:45:34 -0700 Subject: [PATCH 08/51] feat(#230): parse numbers properly and add test for numbers --- harper-core/src/parsers/typst.rs | 54 ++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index 8edfe427..3621bc7c 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -116,8 +116,12 @@ fn map_token( Expr::None(a) => constant_token!(doc, a, TokenKind::Word(WordMetadata::default())), Expr::Auto(a) => constant_token!(doc, a, TokenKind::Word(WordMetadata::default())), Expr::Bool(a) => constant_token!(doc, a, TokenKind::Word(WordMetadata::default())), - Expr::Int(int) => todo!(), - Expr::Float(float) => todo!(), + Expr::Int(int) => { + constant_token!(doc, int, TokenKind::Number((int.get() as f64).into(), None)) + } + Expr::Float(float) => { + constant_token!(doc, float, TokenKind::Number(float.get().into(), None)) + } Expr::Numeric(a) => constant_token!(doc, a, TokenKind::Unlintable), Expr::Str(text) => parse_english(text.get(), doc, parser, &text.span()), Expr::Code(a) => constant_token!(doc, a, TokenKind::Unlintable), @@ -216,6 +220,8 @@ impl Parser for Typst { #[cfg(test)] mod tests { + use ordered_float::OrderedFloat; + use super::Typst; use crate::{parsers::StrParser, Punctuation, TokenKind}; @@ -232,6 +238,50 @@ mod tests { assert!(matches!(token_kinds.as_slice(), &[TokenKind::Word(_),])) } + #[test] + fn number() { + let source = r"The number 12 is larger than 11, but is much less than 11!"; + + let tokens = Typst.parse_str(source); + + let token_kinds = tokens.iter().map(|t| t.kind).collect::>(); + + dbg!(&token_kinds); + + assert!(matches!( + token_kinds.as_slice(), + &[ + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Number(OrderedFloat(12.0), None), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Number(OrderedFloat(11.0), None), + TokenKind::Punctuation(Punctuation::Comma), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Word(_), + TokenKind::Space(1), + TokenKind::Number(OrderedFloat(11.0), None), + TokenKind::Punctuation(Punctuation::Bang), + ] + )) + } + #[test] fn sentence() { let source = r"This is a sentence, it does not have any particularly interesting elements of the typst syntax."; From 1f43b27eefd3e6712b30af94e91470a5c271b2e2 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Wed, 27 Nov 2024 14:23:52 -0700 Subject: [PATCH 09/51] feat(#230): consolidate words separated by apostrophes into possessives or conjunctions --- harper-core/src/parsers/typst.rs | 51 ++++++++++++++++++-- harper-core/src/patterns/sequence_pattern.rs | 1 + 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index 3621bc7c..682413e8 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -1,9 +1,13 @@ use itertools::Itertools; - +use std::collections::VecDeque; use typst_syntax::ast::{AstNode, Expr, Markup}; use super::{Parser, PlainEnglish}; -use crate::{parsers::StrParser, Punctuation, Token, TokenKind, WordMetadata}; +use crate::{ + parsers::StrParser, + patterns::{PatternExt, SequencePattern}, + ConjunctionData, Lrc, Punctuation, Span, Token, TokenKind, VecExt, WordMetadata, +}; /// A parser that wraps the [`PlainEnglish`] parser that allows one to parse /// Typst files. @@ -199,6 +203,13 @@ fn map_token( } } +thread_local! { + static WORD_APOSTROPHE_WORD: Lrc = Lrc::new(SequencePattern::default() + .then_any_word() + .then_apostrophe() + .then_any_word()); +} + impl Parser for Typst { fn parse(&mut self, source: &[char]) -> Vec { let mut english_parser = PlainEnglish; @@ -210,11 +221,43 @@ impl Parser for Typst { // NOTE: the range spits out __byte__ indices, not char indices. // This is why we keep track above. - typst_tree + let mut tokens = typst_tree .exprs() .filter_map(|ex| map_token(ex, &typst_document, &mut english_parser)) .flatten() - .collect_vec() + .collect_vec(); + + // Consolidate conjunctions + let mut to_remove = VecDeque::default(); + for tok_span in WORD_APOSTROPHE_WORD + .with(|v| v.clone()) + .find_all_matches(&tokens, source) + { + let start_tok = &tokens[tok_span.start]; + let end_tok = &tokens[tok_span.end - 1]; + let char_span = Span::new(start_tok.span.start, end_tok.span.end); + + if let TokenKind::Word(metadata) = start_tok.kind { + if end_tok.span.get_content(source) == &['s'] { + if let Some(mut noun) = metadata.noun { + noun.is_possessive = Some(true); + } + } else { + tokens[tok_span.start].kind = TokenKind::Word(WordMetadata { + conjunction: Some(ConjunctionData {}), + ..metadata + }); + }; + + tokens[tok_span.start].span = char_span; + to_remove.extend(tok_span.start + 1..tok_span.end); + } else { + panic!("Apostrophe consolidation does not start with Word Token!") + } + } + tokens.remove_indices(to_remove.into_iter().sorted().unique().collect()); + + tokens } } diff --git a/harper-core/src/patterns/sequence_pattern.rs b/harper-core/src/patterns/sequence_pattern.rs index c26a2035..02a3788e 100644 --- a/harper-core/src/patterns/sequence_pattern.rs +++ b/harper-core/src/patterns/sequence_pattern.rs @@ -43,6 +43,7 @@ impl SequencePattern { gen_then_from_is!(case_separator); gen_then_from_is!(adverb); gen_then_from_is!(adjective); + gen_then_from_is!(apostrophe); pub fn then_exact_word(mut self, word: &'static str) -> Self { self.token_patterns From 835c3964c45b61d977d23e06cd55d272655a6be6 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Wed, 27 Nov 2024 14:25:58 -0700 Subject: [PATCH 10/51] fix(clippy): satisfy clippy --- harper-core/src/parsers/typst.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index 682413e8..1ce641ba 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -238,7 +238,7 @@ impl Parser for Typst { let char_span = Span::new(start_tok.span.start, end_tok.span.end); if let TokenKind::Word(metadata) = start_tok.kind { - if end_tok.span.get_content(source) == &['s'] { + if end_tok.span.get_content(source) == ['s'] { if let Some(mut noun) = metadata.noun { noun.is_possessive = Some(true); } From 4dbc264cda939a32deab24c98f7409411cff040e Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Wed, 27 Nov 2024 15:29:18 -0700 Subject: [PATCH 11/51] feat(#230): simplify possessive-conjunction logic and add respective tests --- harper-core/src/parsers/typst.rs | 89 +++++++++++++++++++++++++------- harper-core/src/word_metadata.rs | 10 ++-- 2 files changed, 75 insertions(+), 24 deletions(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index 1ce641ba..92ba5bb5 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -6,7 +6,7 @@ use super::{Parser, PlainEnglish}; use crate::{ parsers::StrParser, patterns::{PatternExt, SequencePattern}, - ConjunctionData, Lrc, Punctuation, Span, Token, TokenKind, VecExt, WordMetadata, + ConjunctionData, Lrc, NounData, Punctuation, Span, Token, TokenKind, VecExt, WordMetadata, }; /// A parser that wraps the [`PlainEnglish`] parser that allows one to parse @@ -238,16 +238,26 @@ impl Parser for Typst { let char_span = Span::new(start_tok.span.start, end_tok.span.end); if let TokenKind::Word(metadata) = start_tok.kind { - if end_tok.span.get_content(source) == ['s'] { - if let Some(mut noun) = metadata.noun { - noun.is_possessive = Some(true); - } - } else { - tokens[tok_span.start].kind = TokenKind::Word(WordMetadata { - conjunction: Some(ConjunctionData {}), - ..metadata + tokens[tok_span.start].kind = + TokenKind::Word(if end_tok.span.get_content(source) == ['s'] { + WordMetadata { + noun: Some(NounData { + is_possessive: Some(true), + ..metadata.noun.unwrap_or_default() + }), + conjunction: None, + ..metadata + } + } else { + WordMetadata { + noun: metadata.noun.map(|noun| NounData { + is_possessive: Some(false), + ..noun + }), + conjunction: Some(ConjunctionData {}), + ..metadata + } }); - }; tokens[tok_span.start].span = char_span; to_remove.extend(tok_span.start + 1..tok_span.end); @@ -266,7 +276,7 @@ mod tests { use ordered_float::OrderedFloat; use super::Typst; - use crate::{parsers::StrParser, Punctuation, TokenKind}; + use crate::{parsers::StrParser, NounData, Punctuation, TokenKind, WordMetadata}; #[test] fn conjunction() { @@ -278,12 +288,38 @@ mod tests { dbg!(&token_kinds); - assert!(matches!(token_kinds.as_slice(), &[TokenKind::Word(_),])) + assert_eq!(token_kinds.len(), 1); + assert!(token_kinds.into_iter().all(|t| t.is_conjunction())) + } + + #[test] + fn possessive() { + let source = r"person's"; + + let tokens = Typst.parse_str(source); + + let token_kinds = tokens.iter().map(|t| t.kind).collect::>(); + + dbg!(&token_kinds); + + assert_eq!(token_kinds.len(), 1); + assert!(token_kinds.into_iter().all(|t| { + matches!( + t, + TokenKind::Word(WordMetadata { + noun: Some(NounData { + is_possessive: Some(true), + .. + }), + .. + }) + ) + })) } #[test] fn number() { - let source = r"The number 12 is larger than 11, but is much less than 11!"; + let source = r"12 is larger than 11, but much less than 11!"; let tokens = Typst.parse_str(source); @@ -294,10 +330,6 @@ mod tests { assert!(matches!( token_kinds.as_slice(), &[ - TokenKind::Word(_), - TokenKind::Space(1), - TokenKind::Word(_), - TokenKind::Space(1), TokenKind::Number(OrderedFloat(12.0), None), TokenKind::Space(1), TokenKind::Word(_), @@ -317,14 +349,33 @@ mod tests { TokenKind::Space(1), TokenKind::Word(_), TokenKind::Space(1), - TokenKind::Word(_), - TokenKind::Space(1), TokenKind::Number(OrderedFloat(11.0), None), TokenKind::Punctuation(Punctuation::Bang), ] )) } + #[test] + fn math_unlintable() { + let source = r"$12 > 11$, $12 << 11!$"; + + let tokens = Typst.parse_str(source); + + let token_kinds = tokens.iter().map(|t| t.kind).collect::>(); + + dbg!(&token_kinds); + + assert!(matches!( + token_kinds.as_slice(), + &[ + TokenKind::Unlintable, + TokenKind::Punctuation(Punctuation::Comma), + TokenKind::Space(1), + TokenKind::Unlintable, + ] + )) + } + #[test] fn sentence() { let source = r"This is a sentence, it does not have any particularly interesting elements of the typst syntax."; diff --git a/harper-core/src/word_metadata.rs b/harper-core/src/word_metadata.rs index 314f855d..326a3572 100644 --- a/harper-core/src/word_metadata.rs +++ b/harper-core/src/word_metadata.rs @@ -120,7 +120,7 @@ pub enum Tense { Future, } -#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash)] +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)] pub struct VerbData { pub is_linking: Option, pub tense: Option, @@ -136,7 +136,7 @@ impl VerbData { } } -#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash)] +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)] pub struct NounData { pub is_proper: Option, pub is_plural: Option, @@ -156,7 +156,7 @@ impl NounData { } } -#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash)] +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)] pub struct AdjectiveData {} impl AdjectiveData { @@ -166,7 +166,7 @@ impl AdjectiveData { } } -#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash)] +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)] pub struct AdverbData {} impl AdverbData { @@ -176,7 +176,7 @@ impl AdverbData { } } -#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash)] +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Hash, Default)] pub struct ConjunctionData {} impl ConjunctionData { From f3eda9237dfeb55cf0ac190070abb1b755bc7948 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Thu, 28 Nov 2024 21:20:06 -0700 Subject: [PATCH 12/51] feat(#230): create additional parsers for complex dictionary parsing --- harper-core/src/parsers/typst.rs | 85 +++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 2 deletions(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index 92ba5bb5..5d2b235e 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -64,6 +64,88 @@ fn parse_english( Some(res) } +fn parse_dict( + dict: &mut dyn Iterator, + doc: &typst_syntax::Source, + parser: &mut PlainEnglish, +) -> Option> { + Some( + dict.filter_map(|di| match di { + typst_syntax::ast::DictItem::Named(named) => merge_expr!( + constant_token!(doc, named.name(), TokenKind::Word(WordMetadata::default())), + map_token(named.expr(), doc, parser), + parse_pattern(named.pattern(), doc, parser) + ), + typst_syntax::ast::DictItem::Keyed(keyed) => merge_expr!( + map_token(keyed.key(), doc, parser), + map_token(keyed.expr(), doc, parser) + ), + typst_syntax::ast::DictItem::Spread(spread) => spread.sink_ident().map_or_else( + || { + spread + .sink_expr() + .and_then(|expr| map_token(expr, doc, parser)) + }, + |ident| constant_token!(doc, ident, TokenKind::Word(WordMetadata::default())), + ), + }) + .flatten() + .collect(), + ) +} + +fn parse_pattern( + pat: typst_syntax::ast::Pattern, + doc: &typst_syntax::Source, + parser: &mut PlainEnglish, +) -> Option> { + match pat { + typst_syntax::ast::Pattern::Normal(expr) => map_token(expr, doc, parser), + typst_syntax::ast::Pattern::Placeholder(underscore) => { + constant_token!(doc, underscore, TokenKind::Unlintable) + } + typst_syntax::ast::Pattern::Parenthesized(parenthesized) => merge_expr!( + map_token(parenthesized.expr(), doc, parser), + parse_pattern(parenthesized.pattern(), doc, parser) + ), + typst_syntax::ast::Pattern::Destructuring(destructuring) => Some( + destructuring + .items() + .filter_map(|item| match item { + typst_syntax::ast::DestructuringItem::Pattern(pattern) => { + parse_pattern(pattern, doc, parser) + } + typst_syntax::ast::DestructuringItem::Named(named) => merge_expr!( + constant_token!( + doc, + named.name(), + TokenKind::Word(WordMetadata::default()) + ), + parse_pattern(named.pattern(), doc, parser) + ), + typst_syntax::ast::DestructuringItem::Spread(spread) => { + spread.sink_ident().map_or_else( + || { + spread + .sink_expr() + .and_then(|expr| map_token(expr, doc, parser)) + }, + |ident| { + constant_token!( + doc, + ident, + TokenKind::Word(WordMetadata::default()) + ) + }, + ) + } + }) + .flatten() + .collect(), + ), + } +} + fn map_token( ex: typst_syntax::ast::Expr, doc: &typst_syntax::Source, @@ -146,8 +228,7 @@ fn map_token( .flatten() .collect_vec(), ), - // TODO: actually parse dictionaries - Expr::Dict(a) => constant_token!(doc, a, TokenKind::Unlintable), + Expr::Dict(a) => parse_dict(&mut a.items(), doc, parser), Expr::Unary(a) => constant_token!(doc, a, TokenKind::Unlintable), Expr::Binary(a) => constant_token!(doc, a, TokenKind::Unlintable), Expr::FieldAccess(field_access) => merge_expr!( From 24e055182a7f7ae1ac48c5d8683f0dfe841d6ee0 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Fri, 29 Nov 2024 12:14:26 -0700 Subject: [PATCH 13/51] feat(#230): add some tests for dictionary parsing, and improve dict parsing to fit better --- harper-core/src/parsers/typst.rs | 88 ++++++++++++++++++++++---------- 1 file changed, 61 insertions(+), 27 deletions(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index 5d2b235e..2beac0af 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -53,15 +53,17 @@ fn parse_english( parser: &mut PlainEnglish, span: &typst_syntax::Span, ) -> Option> { - let res = parser - .parse_str(str.into()) - .into_iter() - .map(|mut t| { - t.span.push_by(doc.range(*span).unwrap().start); - t - }) - .collect_vec(); - Some(res) + let offset = doc.range(*span).unwrap().start; + Some( + parser + .parse_str(str.into()) + .into_iter() + .map(|mut t| { + t.span.push_by(offset); + t + }) + .collect_vec(), + ) } fn parse_dict( @@ -73,8 +75,7 @@ fn parse_dict( dict.filter_map(|di| match di { typst_syntax::ast::DictItem::Named(named) => merge_expr!( constant_token!(doc, named.name(), TokenKind::Word(WordMetadata::default())), - map_token(named.expr(), doc, parser), - parse_pattern(named.pattern(), doc, parser) + map_token(named.expr(), doc, parser) ), typst_syntax::ast::DictItem::Keyed(keyed) => merge_expr!( map_token(keyed.key(), doc, parser), @@ -241,7 +242,15 @@ fn map_token( ), Expr::FuncCall(a) => constant_token!(doc, a, TokenKind::Unlintable), Expr::Closure(a) => constant_token!(doc, a, TokenKind::Unlintable), - Expr::Let(let_binding) => let_binding.init().and_then(|e| map_token(e, doc, parser)), + Expr::Let(let_binding) => merge_expr!( + match let_binding.kind() { + typst_syntax::ast::LetBindingKind::Normal(pattern) => + parse_pattern(pattern, doc, parser), + typst_syntax::ast::LetBindingKind::Closure(ident) => + constant_token!(doc, ident, TokenKind::Word(WordMetadata::default())), + }, + let_binding.init().and_then(|e| map_token(e, doc, parser)) + ), Expr::DestructAssign(destruct_assignment) => { map_token(destruct_assignment.value(), doc, parser) } @@ -354,6 +363,7 @@ impl Parser for Typst { #[cfg(test)] mod tests { + use itertools::Itertools; use ordered_float::OrderedFloat; use super::Typst; @@ -361,12 +371,10 @@ mod tests { #[test] fn conjunction() { - let source = r"doesn't"; + let source = "doesn't"; let tokens = Typst.parse_str(source); - let token_kinds = tokens.iter().map(|t| t.kind).collect::>(); - dbg!(&token_kinds); assert_eq!(token_kinds.len(), 1); @@ -375,12 +383,10 @@ mod tests { #[test] fn possessive() { - let source = r"person's"; + let source = "person's"; let tokens = Typst.parse_str(source); - let token_kinds = tokens.iter().map(|t| t.kind).collect::>(); - dbg!(&token_kinds); assert_eq!(token_kinds.len(), 1); @@ -400,12 +406,10 @@ mod tests { #[test] fn number() { - let source = r"12 is larger than 11, but much less than 11!"; + let source = "12 is larger than 11, but much less than 11!"; let tokens = Typst.parse_str(source); - let token_kinds = tokens.iter().map(|t| t.kind).collect::>(); - dbg!(&token_kinds); assert!(matches!( @@ -438,12 +442,10 @@ mod tests { #[test] fn math_unlintable() { - let source = r"$12 > 11$, $12 << 11!$"; + let source = "$12 > 11$, $12 << 11!$"; let tokens = Typst.parse_str(source); - let token_kinds = tokens.iter().map(|t| t.kind).collect::>(); - dbg!(&token_kinds); assert!(matches!( @@ -458,13 +460,45 @@ mod tests { } #[test] - fn sentence() { - let source = r"This is a sentence, it does not have any particularly interesting elements of the typst syntax."; + fn dict_parsing() { + let source = r#"#let dict = ( + name: "Typst", + born: 2019, + )"#; let tokens = Typst.parse_str(source); - let token_kinds = tokens.iter().map(|t| t.kind).collect::>(); + dbg!(&token_kinds); + + let typst_document = typst_syntax::Source::detached(source); + let typst_tree = ::from_untyped( + typst_document.root(), + ) + .expect("Unable to create typst document from parsed tree!"); + dbg!(typst_tree.exprs().collect_vec()); + let charslice = source.chars().collect_vec(); + assert_eq!(tokens[2].span.get_content_string(&charslice), "Typst"); + assert!(matches!( + token_kinds.as_slice(), + &[ + TokenKind::Word(_), + TokenKind::Word(_), + TokenKind::Punctuation(Punctuation::Quote { .. }), + TokenKind::Word(_), + TokenKind::Punctuation(Punctuation::Quote { .. }), + TokenKind::Word(_), + TokenKind::Number(OrderedFloat(2019.0), None), + ] + )) + } + + #[test] + fn sentence() { + let source = "This is a sentence, it does not have any particularly interesting elements of the typst syntax."; + + let tokens = Typst.parse_str(source); + let token_kinds = tokens.iter().map(|t| t.kind).collect::>(); dbg!(&token_kinds); assert!(matches!( From c63d41a1c300e6eb17c889003e047fb82e4dc467 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Wed, 4 Dec 2024 11:36:40 -0700 Subject: [PATCH 14/51] fix(#230): fix dict parsing by manually getting document content in span so quotes aren't escaped --- harper-core/src/parsers/typst.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index 2beac0af..df10640a 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -210,7 +210,10 @@ fn map_token( constant_token!(doc, float, TokenKind::Number(float.get().into(), None)) } Expr::Numeric(a) => constant_token!(doc, a, TokenKind::Unlintable), - Expr::Str(text) => parse_english(text.get(), doc, parser, &text.span()), + Expr::Str(text) => { + // Using `text.get()` doesn't work here, because it escapes quotes + parse_english(doc.get(doc.range(text.span())?)?, doc, parser, &text.span()) + } Expr::Code(a) => constant_token!(doc, a, TokenKind::Unlintable), Expr::Content(content_block) => { recursive_env(&mut content_block.body().exprs(), doc, parser) @@ -478,7 +481,7 @@ mod tests { dbg!(typst_tree.exprs().collect_vec()); let charslice = source.chars().collect_vec(); - assert_eq!(tokens[2].span.get_content_string(&charslice), "Typst"); + assert_eq!(tokens[3].span.get_content_string(&charslice), "Typst"); assert!(matches!( token_kinds.as_slice(), &[ From f57d6c2300687c85f754cf778dbddfcdc23b17b7 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Wed, 4 Dec 2024 11:39:09 -0700 Subject: [PATCH 15/51] fix(#230): remove debug print of typst ast in test --- harper-core/src/parsers/typst.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index df10640a..36d4d9a1 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -473,13 +473,6 @@ mod tests { let token_kinds = tokens.iter().map(|t| t.kind).collect::>(); dbg!(&token_kinds); - let typst_document = typst_syntax::Source::detached(source); - let typst_tree = ::from_untyped( - typst_document.root(), - ) - .expect("Unable to create typst document from parsed tree!"); - dbg!(typst_tree.exprs().collect_vec()); - let charslice = source.chars().collect_vec(); assert_eq!(tokens[3].span.get_content_string(&charslice), "Typst"); assert!(matches!( From 550cf20226e26941cab3a62a81b9306a121cfd3b Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Wed, 4 Dec 2024 11:43:18 -0700 Subject: [PATCH 16/51] style(#230): expand explainer on str parsing --- harper-core/src/parsers/typst.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index 36d4d9a1..c49bd73e 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -211,7 +211,8 @@ fn map_token( } Expr::Numeric(a) => constant_token!(doc, a, TokenKind::Unlintable), Expr::Str(text) => { - // Using `text.get()` doesn't work here, because it escapes quotes + // Using `text.get()` doesn't work here, because it escapes quotes which throws off + // the span parse_english(doc.get(doc.range(text.span())?)?, doc, parser, &text.span()) } Expr::Code(a) => constant_token!(doc, a, TokenKind::Unlintable), From 7cd135f89925798062e3521f8334caf9a1a36528 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Fri, 20 Dec 2024 12:45:53 -0600 Subject: [PATCH 17/51] feat(#230): remove quotes from Str parsing --- Cargo.lock | 2 +- harper-core/src/parsers/typst.rs | 74 ++++++++++++++++++++++++-------- 2 files changed, 58 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c263f4b9..9367fa74 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -892,7 +892,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" dependencies = [ "equivalent", - "hashbrown 0.15.1", + "hashbrown 0.15.2", ] [[package]] diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index c49bd73e..af0fedc7 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -211,9 +211,18 @@ fn map_token( } Expr::Numeric(a) => constant_token!(doc, a, TokenKind::Unlintable), Expr::Str(text) => { - // Using `text.get()` doesn't work here, because it escapes quotes which throws off - // the span - parse_english(doc.get(doc.range(text.span())?)?, doc, parser, &text.span()) + let offset = doc.range(text.span()).unwrap().start + 1; + let text = text.to_untyped().text(); + Some( + parser + .parse_str(&text[1..text.len() - 1]) + .into_iter() + .map(|mut t| { + t.span.push_by(offset); + t + }) + .collect_vec(), + ) } Expr::Code(a) => constant_token!(doc, a, TokenKind::Unlintable), Expr::Content(content_block) => { @@ -378,7 +387,7 @@ mod tests { let source = "doesn't"; let tokens = Typst.parse_str(source); - let token_kinds = tokens.iter().map(|t| t.kind).collect::>(); + let token_kinds = tokens.iter().map(|t| t.kind).collect_vec(); dbg!(&token_kinds); assert_eq!(token_kinds.len(), 1); @@ -390,7 +399,7 @@ mod tests { let source = "person's"; let tokens = Typst.parse_str(source); - let token_kinds = tokens.iter().map(|t| t.kind).collect::>(); + let token_kinds = tokens.iter().map(|t| t.kind).collect_vec(); dbg!(&token_kinds); assert_eq!(token_kinds.len(), 1); @@ -413,7 +422,7 @@ mod tests { let source = "12 is larger than 11, but much less than 11!"; let tokens = Typst.parse_str(source); - let token_kinds = tokens.iter().map(|t| t.kind).collect::>(); + let token_kinds = tokens.iter().map(|t| t.kind).collect_vec(); dbg!(&token_kinds); assert!(matches!( @@ -449,7 +458,7 @@ mod tests { let source = "$12 > 11$, $12 << 11!$"; let tokens = Typst.parse_str(source); - let token_kinds = tokens.iter().map(|t| t.kind).collect::>(); + let token_kinds = tokens.iter().map(|t| t.kind).collect_vec(); dbg!(&token_kinds); assert!(matches!( @@ -471,21 +480,52 @@ mod tests { )"#; let tokens = Typst.parse_str(source); - let token_kinds = tokens.iter().map(|t| t.kind).collect::>(); + let token_kinds = tokens.iter().map(|t| t.kind).collect_vec(); dbg!(&token_kinds); let charslice = source.chars().collect_vec(); - assert_eq!(tokens[3].span.get_content_string(&charslice), "Typst"); + assert_eq!(tokens[2].span.get_content_string(&charslice), "Typst"); + assert!(matches!( token_kinds.as_slice(), &[ - TokenKind::Word(_), - TokenKind::Word(_), - TokenKind::Punctuation(Punctuation::Quote { .. }), - TokenKind::Word(_), - TokenKind::Punctuation(Punctuation::Quote { .. }), - TokenKind::Word(_), - TokenKind::Number(OrderedFloat(2019.0), None), + TokenKind::Word(_), // identifier + TokenKind::Word(_), // key 1 + TokenKind::Word(_), // value 1 + TokenKind::Word(_), // key 2 + TokenKind::Number(OrderedFloat(2019.0), None), // value 2 + ] + )) + } + + #[test] + fn str_parsing() { + let source_with_quotes = r#"#let ident = "This is a string""#; + let source_no_quotes = r#"#let ident = This is a string"#; + + let with_quotes_token_kinds = Typst + .parse_str(source_with_quotes) + .iter() + .map(|t| t.kind) + .collect_vec(); + let no_quotes_token_kinds = Typst + .parse_str(source_no_quotes) + .iter() + .map(|t| t.kind) + .collect_vec(); + + assert_eq!(with_quotes_token_kinds, no_quotes_token_kinds); + assert!(matches!( + &with_quotes_token_kinds.as_slice(), + &[ + TokenKind::Word(_), // identifier + TokenKind::Word(_), // This + TokenKind::Space(1), + TokenKind::Word(_), // is + TokenKind::Space(1), + TokenKind::Word(_), // a + TokenKind::Space(1), + TokenKind::Word(_), // string ] )) } @@ -495,7 +535,7 @@ mod tests { let source = "This is a sentence, it does not have any particularly interesting elements of the typst syntax."; let tokens = Typst.parse_str(source); - let token_kinds = tokens.iter().map(|t| t.kind).collect::>(); + let token_kinds = tokens.iter().map(|t| t.kind).collect_vec(); dbg!(&token_kinds); assert!(matches!( From 54418ff204be80c0b2a6a0dafb273e0902bbfb73 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Fri, 20 Dec 2024 15:07:16 -0600 Subject: [PATCH 18/51] fix(#230): remove improper test case --- harper-core/src/parsers/typst.rs | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index af0fedc7..151a8a96 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -500,23 +500,12 @@ mod tests { #[test] fn str_parsing() { - let source_with_quotes = r#"#let ident = "This is a string""#; - let source_no_quotes = r#"#let ident = This is a string"#; + let source = r#"#let ident = "This is a string""#; - let with_quotes_token_kinds = Typst - .parse_str(source_with_quotes) - .iter() - .map(|t| t.kind) - .collect_vec(); - let no_quotes_token_kinds = Typst - .parse_str(source_no_quotes) - .iter() - .map(|t| t.kind) - .collect_vec(); + let token_kinds = Typst.parse_str(source).iter().map(|t| t.kind).collect_vec(); - assert_eq!(with_quotes_token_kinds, no_quotes_token_kinds); assert!(matches!( - &with_quotes_token_kinds.as_slice(), + &token_kinds.as_slice(), &[ TokenKind::Word(_), // identifier TokenKind::Word(_), // This From 533677891e65b80fba132a084116e037a8c1e77c Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Fri, 27 Dec 2024 12:13:51 -0600 Subject: [PATCH 19/51] tests(#230): add test using unicode apostrophe --- harper-cli/src/main.rs | 12 +++--- harper-core/src/parsers/typst.rs | 63 +++++++++++++++++++------------- 2 files changed, 43 insertions(+), 32 deletions(-) diff --git a/harper-cli/src/main.rs b/harper-cli/src/main.rs index cdedb164..a66d4a89 100644 --- a/harper-cli/src/main.rs +++ b/harper-cli/src/main.rs @@ -143,16 +143,14 @@ fn load_file(file: &Path) -> anyhow::Result<(Document, String)> { let source = std::fs::read_to_string(file)?; let mut parser: Box = - if let Some("md") = file.extension().map(|v| v.to_str().unwrap()) { - Box::new(Markdown) - } else if let Some("typ") = file.extension().map(|v| v.to_str().unwrap()) { - Box::new(Typst) - } else { - Box::new( + match file.extension().map(|v| v.to_str().unwrap()) { + Some("md") => Box::new(Markdown), + Some("typ") => Box::new(Typst), + _ => Box::new( CommentParser::new_from_filename(file) .map(Box::new) .ok_or(format_err!("Could not detect language ID."))?, - ) + ), }; Ok((Document::new_curated(&source, &mut parser), source)) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index 151a8a96..f620cc54 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -489,11 +489,11 @@ mod tests { assert!(matches!( token_kinds.as_slice(), &[ - TokenKind::Word(_), // identifier - TokenKind::Word(_), // key 1 - TokenKind::Word(_), // value 1 - TokenKind::Word(_), // key 2 - TokenKind::Number(OrderedFloat(2019.0), None), // value 2 + TokenKind::Word(_), // Identifier + TokenKind::Word(_), // Key 1 + TokenKind::Word(_), // Value 1 + TokenKind::Word(_), // Key 2 + TokenKind::Number(OrderedFloat(2019.0), None), // Value 2 ] )) } @@ -507,21 +507,21 @@ mod tests { assert!(matches!( &token_kinds.as_slice(), &[ - TokenKind::Word(_), // identifier + TokenKind::Word(_), // Identifier TokenKind::Word(_), // This TokenKind::Space(1), - TokenKind::Word(_), // is + TokenKind::Word(_), // Is TokenKind::Space(1), - TokenKind::Word(_), // a + TokenKind::Word(_), // A TokenKind::Space(1), - TokenKind::Word(_), // string + TokenKind::Word(_), // String ] )) } #[test] fn sentence() { - let source = "This is a sentence, it does not have any particularly interesting elements of the typst syntax."; + let source = "This is a sentence, it is not interesting."; let tokens = Typst.parse_str(source); let token_kinds = tokens.iter().map(|t| t.kind).collect_vec(); @@ -546,24 +546,37 @@ mod tests { TokenKind::Word(_), TokenKind::Space(1), TokenKind::Word(_), + TokenKind::Punctuation(Punctuation::Period), + ] + )) + } + + #[test] + fn smart_apostrophe_newline() { + let source = r#"group’s +writing"#; + + let tokens = Typst.parse_str(source); + let token_kinds = tokens.iter().map(|t| t.kind).collect_vec(); + dbg!(&token_kinds); + + let charslice = source.chars().collect_vec(); + assert_eq!(tokens[2].span.get_content_string(&charslice), "writing"); + + assert!(matches!( + token_kinds.as_slice(), + &[ + TokenKind::Word(WordMetadata { + noun: Some(NounData { + is_possessive: Some(true), + .. + }), + .. + }), TokenKind::Space(1), TokenKind::Word(_), TokenKind::Space(1), - TokenKind::Word(_), - TokenKind::Space(1), - TokenKind::Word(_), - TokenKind::Space(1), - TokenKind::Word(_), - TokenKind::Space(1), - TokenKind::Word(_), - TokenKind::Space(1), - TokenKind::Word(_), - TokenKind::Space(1), - TokenKind::Word(_), - TokenKind::Space(1), - TokenKind::Word(_), - TokenKind::Punctuation(Punctuation::Period), ] - )) + )); } } From 60bb98694483964433ff96b75d8c130ad4c17b2d Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Fri, 27 Dec 2024 12:25:37 -0600 Subject: [PATCH 20/51] refactor(#230): simplify parsing by moving some helper functions inside function --- harper-core/src/parsers/typst.rs | 159 +++++++++++++++---------------- 1 file changed, 78 insertions(+), 81 deletions(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index f620cc54..6bb572df 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -34,19 +34,6 @@ macro_rules! merge_expr { }; } -fn recursive_env( - exprs: &mut dyn Iterator, - doc: &typst_syntax::Source, - parser: &mut PlainEnglish, -) -> Option> { - Some( - exprs - .filter_map(|e| map_token(e, doc, parser)) - .flatten() - .collect_vec(), - ) -} - fn parse_english( str: impl Into, doc: &typst_syntax::Source, @@ -75,17 +62,17 @@ fn parse_dict( dict.filter_map(|di| match di { typst_syntax::ast::DictItem::Named(named) => merge_expr!( constant_token!(doc, named.name(), TokenKind::Word(WordMetadata::default())), - map_token(named.expr(), doc, parser) + parse_expr(named.expr(), doc, parser) ), typst_syntax::ast::DictItem::Keyed(keyed) => merge_expr!( - map_token(keyed.key(), doc, parser), - map_token(keyed.expr(), doc, parser) + parse_expr(keyed.key(), doc, parser), + parse_expr(keyed.expr(), doc, parser) ), typst_syntax::ast::DictItem::Spread(spread) => spread.sink_ident().map_or_else( || { spread .sink_expr() - .and_then(|expr| map_token(expr, doc, parser)) + .and_then(|expr| parse_expr(expr, doc, parser)) }, |ident| constant_token!(doc, ident, TokenKind::Word(WordMetadata::default())), ), @@ -101,12 +88,12 @@ fn parse_pattern( parser: &mut PlainEnglish, ) -> Option> { match pat { - typst_syntax::ast::Pattern::Normal(expr) => map_token(expr, doc, parser), + typst_syntax::ast::Pattern::Normal(expr) => parse_expr(expr, doc, parser), typst_syntax::ast::Pattern::Placeholder(underscore) => { constant_token!(doc, underscore, TokenKind::Unlintable) } typst_syntax::ast::Pattern::Parenthesized(parenthesized) => merge_expr!( - map_token(parenthesized.expr(), doc, parser), + parse_expr(parenthesized.expr(), doc, parser), parse_pattern(parenthesized.pattern(), doc, parser) ), typst_syntax::ast::Pattern::Destructuring(destructuring) => Some( @@ -129,7 +116,7 @@ fn parse_pattern( || { spread .sink_expr() - .and_then(|expr| map_token(expr, doc, parser)) + .and_then(|expr| parse_expr(expr, doc, parser)) }, |ident| { constant_token!( @@ -147,49 +134,63 @@ fn parse_pattern( } } -fn map_token( +fn parse_expr( ex: typst_syntax::ast::Expr, doc: &typst_syntax::Source, parser: &mut PlainEnglish, ) -> Option> { + macro_rules! constant_token { + ($a:expr, $to:expr) => {{ + Some(vec![Token { + span: doc.range($a.span()).unwrap().into(), + kind: $to, + }]) + }}; + } + let mut nested_env = |exprs: &mut dyn Iterator| { + Some( + exprs + .filter_map(|e| parse_expr(e, doc, parser)) + .flatten() + .collect_vec(), + ) + }; + match ex { Expr::Text(text) => parse_english(text.get(), doc, parser, &text.span()), - Expr::Space(a) => constant_token!(doc, a, TokenKind::Space(1)), - Expr::Linebreak(a) => constant_token!(doc, a, TokenKind::Newline(1)), - Expr::Parbreak(a) => constant_token!(doc, a, TokenKind::ParagraphBreak), - Expr::Escape(a) => constant_token!(doc, a, TokenKind::Unlintable), - Expr::Shorthand(a) => constant_token!(doc, a, TokenKind::Unlintable), + Expr::Space(a) => constant_token!(a, TokenKind::Space(1)), + Expr::Linebreak(a) => constant_token!(a, TokenKind::Newline(1)), + Expr::Parbreak(a) => constant_token!(a, TokenKind::ParagraphBreak), + Expr::Escape(a) => constant_token!(a, TokenKind::Unlintable), + Expr::Shorthand(a) => constant_token!(a, TokenKind::Unlintable), Expr::SmartQuote(quote) => { if quote.double() { constant_token!( - doc, quote, TokenKind::Punctuation(Punctuation::Quote(crate::Quote { twin_loc: None })) ) } else { - constant_token!(doc, quote, TokenKind::Punctuation(Punctuation::Apostrophe)) + constant_token!(quote, TokenKind::Punctuation(Punctuation::Apostrophe)) } } - Expr::Strong(strong) => recursive_env(&mut strong.body().exprs(), doc, parser), - Expr::Emph(emph) => recursive_env(&mut emph.body().exprs(), doc, parser), - Expr::Raw(a) => constant_token!(doc, a, TokenKind::Unlintable), - Expr::Link(a) => constant_token!(doc, a, TokenKind::Url), + Expr::Strong(strong) => nested_env(&mut strong.body().exprs()), + Expr::Emph(emph) => nested_env(&mut emph.body().exprs()), + Expr::Raw(a) => constant_token!(a, TokenKind::Unlintable), + Expr::Link(a) => constant_token!(a, TokenKind::Url), Expr::Label(label) => parse_english(label.get(), doc, parser, &label.span()), Expr::Ref(a) => { - constant_token!(doc, a, TokenKind::Word(WordMetadata::default())) + constant_token!(a, TokenKind::Word(WordMetadata::default())) } - Expr::Heading(heading) => recursive_env(&mut heading.body().exprs(), doc, parser), - Expr::List(list_item) => recursive_env(&mut list_item.body().exprs(), doc, parser), - Expr::Enum(enum_item) => recursive_env(&mut enum_item.body().exprs(), doc, parser), - Expr::Term(term_item) => recursive_env( + Expr::Heading(heading) => nested_env(&mut heading.body().exprs()), + Expr::List(list_item) => nested_env(&mut list_item.body().exprs()), + Expr::Enum(enum_item) => nested_env(&mut enum_item.body().exprs()), + Expr::Term(term_item) => nested_env( &mut term_item .term() .exprs() .chain(term_item.description().exprs()), - doc, - parser, ), - Expr::Equation(a) => constant_token!(doc, a, TokenKind::Unlintable), + Expr::Equation(a) => constant_token!(a, TokenKind::Unlintable), Expr::Math(_) => panic!("Unexpected math outside equation environment."), Expr::MathIdent(_) => panic!("Unexpected math outside equation environment."), Expr::MathShorthand(_) => panic!("Unexpected math outside equation environment."), @@ -199,17 +200,17 @@ fn map_token( Expr::MathPrimes(_) => panic!("Unexpected math outside equation environment."), Expr::MathFrac(_) => panic!("Unexpected math outside equation environment."), Expr::MathRoot(_) => panic!("Unexpected math outside equation environment."), - Expr::Ident(a) => constant_token!(doc, a, TokenKind::Word(WordMetadata::default())), - Expr::None(a) => constant_token!(doc, a, TokenKind::Word(WordMetadata::default())), - Expr::Auto(a) => constant_token!(doc, a, TokenKind::Word(WordMetadata::default())), - Expr::Bool(a) => constant_token!(doc, a, TokenKind::Word(WordMetadata::default())), + Expr::Ident(a) => constant_token!(a, TokenKind::Word(WordMetadata::default())), + Expr::None(a) => constant_token!(a, TokenKind::Word(WordMetadata::default())), + Expr::Auto(a) => constant_token!(a, TokenKind::Word(WordMetadata::default())), + Expr::Bool(a) => constant_token!(a, TokenKind::Word(WordMetadata::default())), Expr::Int(int) => { - constant_token!(doc, int, TokenKind::Number((int.get() as f64).into(), None)) + constant_token!(int, TokenKind::Number((int.get() as f64).into(), None)) } Expr::Float(float) => { - constant_token!(doc, float, TokenKind::Number(float.get().into(), None)) + constant_token!(float, TokenKind::Number(float.get().into(), None)) } - Expr::Numeric(a) => constant_token!(doc, a, TokenKind::Unlintable), + Expr::Numeric(a) => constant_token!(a, TokenKind::Unlintable), Expr::Str(text) => { let offset = doc.range(text.span()).unwrap().start + 1; let text = text.to_untyped().text(); @@ -224,17 +225,15 @@ fn map_token( .collect_vec(), ) } - Expr::Code(a) => constant_token!(doc, a, TokenKind::Unlintable), - Expr::Content(content_block) => { - recursive_env(&mut content_block.body().exprs(), doc, parser) - } - Expr::Parenthesized(parenthesized) => map_token(parenthesized.expr(), doc, parser), + Expr::Code(a) => constant_token!(a, TokenKind::Unlintable), + Expr::Content(content_block) => nested_env(&mut content_block.body().exprs()), + Expr::Parenthesized(parenthesized) => parse_expr(parenthesized.expr(), doc, parser), Expr::Array(array) => Some( array .items() .filter_map(|i| { if let typst_syntax::ast::ArrayItem::Pos(e) = i { - map_token(e, doc, parser) + parse_expr(e, doc, parser) } else { None } @@ -243,66 +242,64 @@ fn map_token( .collect_vec(), ), Expr::Dict(a) => parse_dict(&mut a.items(), doc, parser), - Expr::Unary(a) => constant_token!(doc, a, TokenKind::Unlintable), - Expr::Binary(a) => constant_token!(doc, a, TokenKind::Unlintable), + Expr::Unary(a) => constant_token!(a, TokenKind::Unlintable), + Expr::Binary(a) => constant_token!(a, TokenKind::Unlintable), Expr::FieldAccess(field_access) => merge_expr!( - map_token(field_access.target(), doc, parser), + parse_expr(field_access.target(), doc, parser), constant_token!( - doc, field_access.field(), TokenKind::Word(WordMetadata::default()) ) ), - Expr::FuncCall(a) => constant_token!(doc, a, TokenKind::Unlintable), - Expr::Closure(a) => constant_token!(doc, a, TokenKind::Unlintable), + Expr::FuncCall(a) => constant_token!(a, TokenKind::Unlintable), + Expr::Closure(a) => constant_token!(a, TokenKind::Unlintable), Expr::Let(let_binding) => merge_expr!( match let_binding.kind() { typst_syntax::ast::LetBindingKind::Normal(pattern) => parse_pattern(pattern, doc, parser), typst_syntax::ast::LetBindingKind::Closure(ident) => - constant_token!(doc, ident, TokenKind::Word(WordMetadata::default())), + constant_token!(ident, TokenKind::Word(WordMetadata::default())), }, - let_binding.init().and_then(|e| map_token(e, doc, parser)) + let_binding.init().and_then(|e| parse_expr(e, doc, parser)) ), Expr::DestructAssign(destruct_assignment) => { - map_token(destruct_assignment.value(), doc, parser) + parse_expr(destruct_assignment.value(), doc, parser) } Expr::Set(set_rule) => merge_expr!( - map_token(set_rule.target(), doc, parser), - map_token(set_rule.condition()?, doc, parser) + parse_expr(set_rule.target(), doc, parser), + parse_expr(set_rule.condition()?, doc, parser) ), Expr::Show(show_rule) => merge_expr!( - map_token(show_rule.transform(), doc, parser), - map_token(show_rule.selector()?, doc, parser) + parse_expr(show_rule.transform(), doc, parser), + parse_expr(show_rule.selector()?, doc, parser) ), - Expr::Contextual(contextual) => map_token(contextual.body(), doc, parser), + Expr::Contextual(contextual) => parse_expr(contextual.body(), doc, parser), Expr::Conditional(conditional) => merge_expr!( - map_token(conditional.condition(), doc, parser), - map_token(conditional.if_body(), doc, parser), - map_token(conditional.else_body()?, doc, parser) + parse_expr(conditional.condition(), doc, parser), + parse_expr(conditional.if_body(), doc, parser), + parse_expr(conditional.else_body()?, doc, parser) ), Expr::While(while_loop) => merge_expr!( - map_token(while_loop.condition(), doc, parser), - map_token(while_loop.body(), doc, parser) + parse_expr(while_loop.condition(), doc, parser), + parse_expr(while_loop.body(), doc, parser) ), Expr::For(for_loop) => merge_expr!( - map_token(for_loop.iterable(), doc, parser), - map_token(for_loop.body(), doc, parser) + parse_expr(for_loop.iterable(), doc, parser), + parse_expr(for_loop.body(), doc, parser) ), Expr::Import(module_import) => { merge_expr!( - map_token(module_import.source(), doc, parser), + parse_expr(module_import.source(), doc, parser), constant_token!( - doc, module_import.new_name()?, TokenKind::Word(WordMetadata::default()) ) ) } - Expr::Include(module_include) => map_token(module_include.source(), doc, parser), - Expr::Break(a) => constant_token!(doc, a, TokenKind::Unlintable), - Expr::Continue(a) => constant_token!(doc, a, TokenKind::Unlintable), - Expr::Return(a) => constant_token!(doc, a, TokenKind::Unlintable), + Expr::Include(module_include) => parse_expr(module_include.source(), doc, parser), + Expr::Break(a) => constant_token!(a, TokenKind::Unlintable), + Expr::Continue(a) => constant_token!(a, TokenKind::Unlintable), + Expr::Return(a) => constant_token!(a, TokenKind::Unlintable), } } @@ -326,7 +323,7 @@ impl Parser for Typst { // This is why we keep track above. let mut tokens = typst_tree .exprs() - .filter_map(|ex| map_token(ex, &typst_document, &mut english_parser)) + .filter_map(|ex| parse_expr(ex, &typst_document, &mut english_parser)) .flatten() .collect_vec(); From 299d810f5cf2669a66cd7effdaf9ccd3cb7c668a Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Fri, 27 Dec 2024 13:42:26 -0600 Subject: [PATCH 21/51] feat(#230): support unicode characters --- harper-core/src/parsers/typst.rs | 300 +++++++++++++++++++++++++------ 1 file changed, 247 insertions(+), 53 deletions(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index 6bb572df..798c22bd 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -13,6 +13,61 @@ use crate::{ /// Typst files. pub struct Typst; +#[derive(Debug, Clone, Copy)] +struct Offset<'a> { + doc: &'a typst_syntax::Source, + pub char: usize, + pub byte: usize, +} + +impl<'a> Offset<'a> { + pub fn new(doc: &'a typst_syntax::Source) -> Self { + Self { + doc, + char: 0, + byte: 0, + } + } + + pub fn update_to(&mut self, new_byte: usize) { + assert!(new_byte >= self.byte); + self.char += self.doc.get(self.byte..new_byte).unwrap().chars().count(); + self.byte = new_byte; + } + + pub fn update_by(&mut self, relative_bytes: usize) { + self.char += self + .doc + .get(self.byte..(self.byte + relative_bytes)) + .unwrap() + .chars() + .count(); + self.byte += relative_bytes; + } + + pub fn push_to(self, new_byte: usize) -> Self { + assert!(new_byte >= self.byte); + Self { + char: self.doc.get(self.byte..new_byte).unwrap().chars().count(), + byte: new_byte, + ..self + } + } + + pub fn push_by(self, relative_bytes: usize) -> Self { + let mut new = self; + new.update_by(relative_bytes); + + new + } + + pub fn push_to_span(self, span: typst_syntax::Span) -> Self { + let new_byte = self.doc.range(span).unwrap().start; + + self.push_to(new_byte) + } +} + macro_rules! constant_token { ($doc:ident, $a:expr, $to:expr) => {{ Some(vec![Token { @@ -36,17 +91,15 @@ macro_rules! merge_expr { fn parse_english( str: impl Into, - doc: &typst_syntax::Source, parser: &mut PlainEnglish, - span: &typst_syntax::Span, + offset: Offset, ) -> Option> { - let offset = doc.range(*span).unwrap().start; Some( parser .parse_str(str.into()) .into_iter() .map(|mut t| { - t.span.push_by(offset); + t.span.push_by(offset.char); t }) .collect_vec(), @@ -57,22 +110,38 @@ fn parse_dict( dict: &mut dyn Iterator, doc: &typst_syntax::Source, parser: &mut PlainEnglish, + offset: Offset, ) -> Option> { Some( dict.filter_map(|di| match di { typst_syntax::ast::DictItem::Named(named) => merge_expr!( constant_token!(doc, named.name(), TokenKind::Word(WordMetadata::default())), - parse_expr(named.expr(), doc, parser) + parse_expr( + named.expr(), + doc, + parser, + offset.push_to_span(named.expr().span()) + ) ), typst_syntax::ast::DictItem::Keyed(keyed) => merge_expr!( - parse_expr(keyed.key(), doc, parser), - parse_expr(keyed.expr(), doc, parser) + parse_expr( + keyed.key(), + doc, + parser, + offset.push_to_span(keyed.key().span()) + ), + parse_expr( + keyed.expr(), + doc, + parser, + offset.push_to_span(keyed.expr().span()) + ) ), typst_syntax::ast::DictItem::Spread(spread) => spread.sink_ident().map_or_else( || { - spread - .sink_expr() - .and_then(|expr| parse_expr(expr, doc, parser)) + spread.sink_expr().and_then(|expr| { + parse_expr(expr, doc, parser, offset.push_to_span(expr.span())) + }) }, |ident| constant_token!(doc, ident, TokenKind::Word(WordMetadata::default())), ), @@ -86,22 +155,35 @@ fn parse_pattern( pat: typst_syntax::ast::Pattern, doc: &typst_syntax::Source, parser: &mut PlainEnglish, + offset: Offset, ) -> Option> { match pat { - typst_syntax::ast::Pattern::Normal(expr) => parse_expr(expr, doc, parser), + typst_syntax::ast::Pattern::Normal(expr) => { + parse_expr(expr, doc, parser, offset.push_to_span(expr.span())) + } typst_syntax::ast::Pattern::Placeholder(underscore) => { constant_token!(doc, underscore, TokenKind::Unlintable) } typst_syntax::ast::Pattern::Parenthesized(parenthesized) => merge_expr!( - parse_expr(parenthesized.expr(), doc, parser), - parse_pattern(parenthesized.pattern(), doc, parser) + parse_expr( + parenthesized.expr(), + doc, + parser, + offset.push_to_span(parenthesized.expr().span()) + ), + parse_pattern( + parenthesized.pattern(), + doc, + parser, + offset.push_to_span(parenthesized.pattern().span()) + ) ), typst_syntax::ast::Pattern::Destructuring(destructuring) => Some( destructuring .items() .filter_map(|item| match item { typst_syntax::ast::DestructuringItem::Pattern(pattern) => { - parse_pattern(pattern, doc, parser) + parse_pattern(pattern, doc, parser, offset.push_to_span(pattern.span())) } typst_syntax::ast::DestructuringItem::Named(named) => merge_expr!( constant_token!( @@ -109,14 +191,19 @@ fn parse_pattern( named.name(), TokenKind::Word(WordMetadata::default()) ), - parse_pattern(named.pattern(), doc, parser) + parse_pattern( + named.pattern(), + doc, + parser, + offset.push_to_span(named.pattern().span()) + ) ), typst_syntax::ast::DestructuringItem::Spread(spread) => { spread.sink_ident().map_or_else( || { - spread - .sink_expr() - .and_then(|expr| parse_expr(expr, doc, parser)) + spread.sink_expr().and_then(|expr| { + parse_expr(expr, doc, parser, offset.push_to_span(expr.span())) + }) }, |ident| { constant_token!( @@ -138,6 +225,7 @@ fn parse_expr( ex: typst_syntax::ast::Expr, doc: &typst_syntax::Source, parser: &mut PlainEnglish, + offset: Offset, ) -> Option> { macro_rules! constant_token { ($a:expr, $to:expr) => {{ @@ -147,17 +235,18 @@ fn parse_expr( }]) }}; } - let mut nested_env = |exprs: &mut dyn Iterator| { + let mut nested_env = |exprs: &mut dyn Iterator, + offset: Offset| { Some( exprs - .filter_map(|e| parse_expr(e, doc, parser)) + .filter_map(|e| parse_expr(e, doc, parser, offset)) .flatten() .collect_vec(), ) }; match ex { - Expr::Text(text) => parse_english(text.get(), doc, parser, &text.span()), + Expr::Text(text) => parse_english(text.get(), parser, offset.push_to_span(text.span())), Expr::Space(a) => constant_token!(a, TokenKind::Space(1)), Expr::Linebreak(a) => constant_token!(a, TokenKind::Newline(1)), Expr::Parbreak(a) => constant_token!(a, TokenKind::ParagraphBreak), @@ -173,22 +262,35 @@ fn parse_expr( constant_token!(quote, TokenKind::Punctuation(Punctuation::Apostrophe)) } } - Expr::Strong(strong) => nested_env(&mut strong.body().exprs()), - Expr::Emph(emph) => nested_env(&mut emph.body().exprs()), + Expr::Strong(strong) => nested_env( + &mut strong.body().exprs(), + offset.push_to_span(strong.span()), + ), + Expr::Emph(emph) => nested_env(&mut emph.body().exprs(), offset.push_to_span(emph.span())), Expr::Raw(a) => constant_token!(a, TokenKind::Unlintable), Expr::Link(a) => constant_token!(a, TokenKind::Url), - Expr::Label(label) => parse_english(label.get(), doc, parser, &label.span()), + Expr::Label(label) => parse_english(label.get(), parser, offset.push_to_span(label.span())), Expr::Ref(a) => { constant_token!(a, TokenKind::Word(WordMetadata::default())) } - Expr::Heading(heading) => nested_env(&mut heading.body().exprs()), - Expr::List(list_item) => nested_env(&mut list_item.body().exprs()), - Expr::Enum(enum_item) => nested_env(&mut enum_item.body().exprs()), + Expr::Heading(heading) => nested_env( + &mut heading.body().exprs(), + offset.push_to_span(heading.span()), + ), + Expr::List(list_item) => nested_env( + &mut list_item.body().exprs(), + offset.push_to_span(list_item.span()), + ), + Expr::Enum(enum_item) => nested_env( + &mut enum_item.body().exprs(), + offset.push_to_span(enum_item.span()), + ), Expr::Term(term_item) => nested_env( &mut term_item .term() .exprs() .chain(term_item.description().exprs()), + offset.push_to_span(term_item.span()), ), Expr::Equation(a) => constant_token!(a, TokenKind::Unlintable), Expr::Math(_) => panic!("Unexpected math outside equation environment."), @@ -226,14 +328,22 @@ fn parse_expr( ) } Expr::Code(a) => constant_token!(a, TokenKind::Unlintable), - Expr::Content(content_block) => nested_env(&mut content_block.body().exprs()), - Expr::Parenthesized(parenthesized) => parse_expr(parenthesized.expr(), doc, parser), + Expr::Content(content_block) => nested_env( + &mut content_block.body().exprs(), + offset.push_to_span(content_block.span()), + ), + Expr::Parenthesized(parenthesized) => parse_expr( + parenthesized.expr(), + doc, + parser, + offset.push_to_span(parenthesized.span()), + ), Expr::Array(array) => Some( array .items() .filter_map(|i| { if let typst_syntax::ast::ArrayItem::Pos(e) = i { - parse_expr(e, doc, parser) + parse_expr(e, doc, parser, offset.push_to_span(array.span())) } else { None } @@ -241,11 +351,16 @@ fn parse_expr( .flatten() .collect_vec(), ), - Expr::Dict(a) => parse_dict(&mut a.items(), doc, parser), + Expr::Dict(a) => parse_dict(&mut a.items(), doc, parser, offset.push_to_span(a.span())), Expr::Unary(a) => constant_token!(a, TokenKind::Unlintable), Expr::Binary(a) => constant_token!(a, TokenKind::Unlintable), Expr::FieldAccess(field_access) => merge_expr!( - parse_expr(field_access.target(), doc, parser), + parse_expr( + field_access.target(), + doc, + parser, + offset.push_to_span(field_access.span()) + ), constant_token!( field_access.field(), TokenKind::Word(WordMetadata::default()) @@ -256,47 +371,120 @@ fn parse_expr( Expr::Let(let_binding) => merge_expr!( match let_binding.kind() { typst_syntax::ast::LetBindingKind::Normal(pattern) => - parse_pattern(pattern, doc, parser), + parse_pattern(pattern, doc, parser, offset.push_to_span(pattern.span())), typst_syntax::ast::LetBindingKind::Closure(ident) => constant_token!(ident, TokenKind::Word(WordMetadata::default())), }, - let_binding.init().and_then(|e| parse_expr(e, doc, parser)) + let_binding.init().and_then(|e| parse_expr( + e, + doc, + parser, + offset.push_to_span(e.span()) + )) + ), + Expr::DestructAssign(destruct_assignment) => parse_expr( + destruct_assignment.value(), + doc, + parser, + offset.push_to_span(destruct_assignment.span()), ), - Expr::DestructAssign(destruct_assignment) => { - parse_expr(destruct_assignment.value(), doc, parser) - } Expr::Set(set_rule) => merge_expr!( - parse_expr(set_rule.target(), doc, parser), - parse_expr(set_rule.condition()?, doc, parser) + parse_expr( + set_rule.target(), + doc, + parser, + offset.push_to_span(set_rule.target().span()) + ), + parse_expr( + set_rule.condition()?, + doc, + parser, + offset.push_to_span(set_rule.condition()?.span()) + ) ), Expr::Show(show_rule) => merge_expr!( - parse_expr(show_rule.transform(), doc, parser), - parse_expr(show_rule.selector()?, doc, parser) + parse_expr( + show_rule.transform(), + doc, + parser, + offset.push_to_span(show_rule.transform().span()) + ), + parse_expr( + show_rule.selector()?, + doc, + parser, + offset.push_to_span(show_rule.selector()?.span()) + ) + ), + Expr::Contextual(contextual) => parse_expr( + contextual.body(), + doc, + parser, + offset.push_to_span(contextual.span()), ), - Expr::Contextual(contextual) => parse_expr(contextual.body(), doc, parser), Expr::Conditional(conditional) => merge_expr!( - parse_expr(conditional.condition(), doc, parser), - parse_expr(conditional.if_body(), doc, parser), - parse_expr(conditional.else_body()?, doc, parser) + parse_expr( + conditional.condition(), + doc, + parser, + offset.push_to_span(conditional.condition().span()) + ), + parse_expr( + conditional.if_body(), + doc, + parser, + offset.push_to_span(conditional.if_body().span()) + ), + parse_expr( + conditional.else_body()?, + doc, + parser, + offset.push_to_span(conditional.else_body()?.span()) + ) ), Expr::While(while_loop) => merge_expr!( - parse_expr(while_loop.condition(), doc, parser), - parse_expr(while_loop.body(), doc, parser) + parse_expr( + while_loop.condition(), + doc, + parser, + offset.push_to_span(while_loop.condition().span()) + ), + parse_expr( + while_loop.body(), + doc, + parser, + offset.push_to_span(while_loop.body().span()) + ) ), Expr::For(for_loop) => merge_expr!( - parse_expr(for_loop.iterable(), doc, parser), - parse_expr(for_loop.body(), doc, parser) + parse_expr( + for_loop.iterable(), + doc, + parser, + offset.push_to_span(for_loop.iterable().span()) + ), + parse_expr( + for_loop.body(), + doc, + parser, + offset.push_to_span(for_loop.body().span()) + ) ), Expr::Import(module_import) => { merge_expr!( - parse_expr(module_import.source(), doc, parser), + parse_expr( + module_import.source(), + doc, + parser, + offset.push_to_span(module_import.source().span()) + ), constant_token!( module_import.new_name()?, TokenKind::Word(WordMetadata::default()) ) ) } - Expr::Include(module_include) => parse_expr(module_include.source(), doc, parser), + Expr::Include(a) => constant_token!(a, TokenKind::Unlintable), Expr::Break(a) => constant_token!(a, TokenKind::Unlintable), Expr::Continue(a) => constant_token!(a, TokenKind::Unlintable), Expr::Return(a) => constant_token!(a, TokenKind::Unlintable), @@ -323,7 +511,14 @@ impl Parser for Typst { // This is why we keep track above. let mut tokens = typst_tree .exprs() - .filter_map(|ex| parse_expr(ex, &typst_document, &mut english_parser)) + .filter_map(|ex| { + parse_expr( + ex, + &typst_document, + &mut english_parser, + Offset::new(&typst_document), + ) + }) .flatten() .collect_vec(); @@ -572,7 +767,6 @@ writing"#; }), TokenKind::Space(1), TokenKind::Word(_), - TokenKind::Space(1), ] )); } From 58b3fb0248f099bcf78f94d1f69bb9b8c84cc4e1 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Fri, 27 Dec 2024 13:54:43 -0600 Subject: [PATCH 22/51] fix(#230): use offset struct for constant tokens --- harper-core/src/parsers/typst.rs | 71 ++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index 798c22bd..bfc1ef55 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -13,6 +13,7 @@ use crate::{ /// Typst files. pub struct Typst; +/// Encapsulation of the translation between byte-based spans and char-based spans #[derive(Debug, Clone, Copy)] struct Offset<'a> { doc: &'a typst_syntax::Source, @@ -29,22 +30,6 @@ impl<'a> Offset<'a> { } } - pub fn update_to(&mut self, new_byte: usize) { - assert!(new_byte >= self.byte); - self.char += self.doc.get(self.byte..new_byte).unwrap().chars().count(); - self.byte = new_byte; - } - - pub fn update_by(&mut self, relative_bytes: usize) { - self.char += self - .doc - .get(self.byte..(self.byte + relative_bytes)) - .unwrap() - .chars() - .count(); - self.byte += relative_bytes; - } - pub fn push_to(self, new_byte: usize) -> Self { assert!(new_byte >= self.byte); Self { @@ -55,24 +40,33 @@ impl<'a> Offset<'a> { } pub fn push_by(self, relative_bytes: usize) -> Self { - let mut new = self; - new.update_by(relative_bytes); - - new + let new_byte = self.byte + relative_bytes; + Self { + char: self.doc.get(self.byte..new_byte).unwrap().chars().count(), + byte: new_byte, + ..self + } } pub fn push_to_span(self, span: typst_syntax::Span) -> Self { let new_byte = self.doc.range(span).unwrap().start; + assert!(new_byte >= self.byte); self.push_to(new_byte) } } macro_rules! constant_token { - ($doc:ident, $a:expr, $to:expr) => {{ + ($doc:ident, $a:expr, $kind:expr, $offset:expr) => {{ + let start_char_loc = $offset.push_to($doc.range($a.span()).unwrap().start).char; + let end_char_loc = $offset.push_to($doc.range($a.span()).unwrap().end).char; + Some(vec![Token { - span: $doc.range($a.span()).unwrap().into(), - kind: $to, + span: Span { + start: start_char_loc, + end: end_char_loc, + }, + kind: $kind, }]) }}; } @@ -115,7 +109,12 @@ fn parse_dict( Some( dict.filter_map(|di| match di { typst_syntax::ast::DictItem::Named(named) => merge_expr!( - constant_token!(doc, named.name(), TokenKind::Word(WordMetadata::default())), + constant_token!( + doc, + named.name(), + TokenKind::Word(WordMetadata::default()), + offset + ), parse_expr( named.expr(), doc, @@ -143,7 +142,9 @@ fn parse_dict( parse_expr(expr, doc, parser, offset.push_to_span(expr.span())) }) }, - |ident| constant_token!(doc, ident, TokenKind::Word(WordMetadata::default())), + |ident| { + constant_token!(doc, ident, TokenKind::Word(WordMetadata::default()), offset) + }, ), }) .flatten() @@ -162,7 +163,7 @@ fn parse_pattern( parse_expr(expr, doc, parser, offset.push_to_span(expr.span())) } typst_syntax::ast::Pattern::Placeholder(underscore) => { - constant_token!(doc, underscore, TokenKind::Unlintable) + constant_token!(doc, underscore, TokenKind::Unlintable, offset) } typst_syntax::ast::Pattern::Parenthesized(parenthesized) => merge_expr!( parse_expr( @@ -189,7 +190,8 @@ fn parse_pattern( constant_token!( doc, named.name(), - TokenKind::Word(WordMetadata::default()) + TokenKind::Word(WordMetadata::default()), + offset ), parse_pattern( named.pattern(), @@ -209,7 +211,8 @@ fn parse_pattern( constant_token!( doc, ident, - TokenKind::Word(WordMetadata::default()) + TokenKind::Word(WordMetadata::default()), + offset ) }, ) @@ -228,10 +231,16 @@ fn parse_expr( offset: Offset, ) -> Option> { macro_rules! constant_token { - ($a:expr, $to:expr) => {{ + ($a:expr, $kind:expr) => {{ + let start_char_loc = offset.push_to(doc.range($a.span()).unwrap().start).char; + let end_char_loc = offset.push_to(doc.range($a.span()).unwrap().end).char; + Some(vec![Token { - span: doc.range($a.span()).unwrap().into(), - kind: $to, + span: Span { + start: start_char_loc, + end: end_char_loc, + }, + kind: $kind, }]) }}; } From 1776346a8543d120f34aa6e24967a278b2675841 Mon Sep 17 00:00:00 2001 From: Grant Lemons Date: Fri, 27 Dec 2024 15:05:16 -0600 Subject: [PATCH 23/51] feat(#230): mark labels unlintable and add some tests --- harper-core/src/parsers/typst.rs | 63 +++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/harper-core/src/parsers/typst.rs b/harper-core/src/parsers/typst.rs index bfc1ef55..445c77aa 100644 --- a/harper-core/src/parsers/typst.rs +++ b/harper-core/src/parsers/typst.rs @@ -278,7 +278,7 @@ fn parse_expr( Expr::Emph(emph) => nested_env(&mut emph.body().exprs(), offset.push_to_span(emph.span())), Expr::Raw(a) => constant_token!(a, TokenKind::Unlintable), Expr::Link(a) => constant_token!(a, TokenKind::Url), - Expr::Label(label) => parse_english(label.get(), parser, offset.push_to_span(label.span())), + Expr::Label(a) => constant_token!(a, TokenKind::Unlintable), Expr::Ref(a) => { constant_token!(a, TokenKind::Word(WordMetadata::default())) } @@ -704,6 +704,7 @@ mod tests { let source = r#"#let ident = "This is a string""#; let token_kinds = Typst.parse_str(source).iter().map(|t| t.kind).collect_vec(); + dbg!(&token_kinds); assert!(matches!( &token_kinds.as_slice(), @@ -720,6 +721,66 @@ mod tests { )) } + #[test] + fn header_parsing() { + let source = r"= Header + Paragraph"; + + let tokens = Typst.parse_str(source); + let token_kinds = tokens.iter().map(|t| t.kind).collect_vec(); + dbg!(&token_kinds); + + let charslice = source.chars().collect_vec(); + assert_eq!(tokens[0].span.get_content_string(&charslice), "Header"); + assert_eq!(tokens[2].span.get_content_string(&charslice), "Paragraph"); + + assert!(matches!( + &token_kinds.as_slice(), + &[TokenKind::Word(_), TokenKind::Space(1), TokenKind::Word(_)] + )) + } + + #[test] + fn parbreak() { + let source = r"Paragraph + + Paragraph"; + + let token_kinds = Typst.parse_str(source).iter().map(|t| t.kind).collect_vec(); + dbg!(&token_kinds); + + assert!(matches!( + &token_kinds.as_slice(), + &[ + TokenKind::Word(_), + TokenKind::ParagraphBreak, + TokenKind::Word(_), + ] + )) + } + + #[test] + fn label_unlintable() { + let source = r"= Header +