hydro-project · jaboatman · Mar 13, 2025 · Mar 13, 2025 · Mar 13, 2025 · Jul 9, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -5,5 +5,15 @@ members = [
   "runtime",
   "tool",
   "example",
-  "common",
+  "common", "types",
 ]
+[workspace.package]
+version = "0.5.0"
+authors = [
+  "Jason Boatman",
+  "Shadaj Laddad <shadaj@users.noreply.github.com>"
+]
+
+[workspace.dependencies]
+tree-sitter = { git = "https://github.com/jaboatman/tree-sitter", branch = "combined" }
+tree-sitter-generate = { git = "https://github.com/jaboatman/tree-sitter", branch = "combined", default-features = false, features = ["load"] }
diff --git a/README.md b/README.md
@@ -1,108 +1,107 @@
-# Rust Sitter
-[![Crates.io](https://img.shields.io/crates/v/rust-sitter)](https://crates.io/crates/rust-sitter)
+# Rust Sitter - Otonoma fork
+**This project is a fork of [rust-sitter](https://github.com/hydro-project/rust-sitter). It has been heavily
+modified in many breaking ways.**
 
 Rust Sitter makes it easy to create efficient parsers in Rust by leveraging the [Tree Sitter](https://tree-sitter.github.io/tree-sitter/) parser generator. With Rust Sitter, you can define your entire grammar with annotations on idiomatic Rust code, and let macros generate the parser and type-safe bindings for you!
 
 ## Installation
 First, add Rust/Tree Sitter to your `Cargo.toml`:
 ```toml
 [dependencies]
-rust-sitter = "0.4.5"
+rust-sitter = { git = "https://github.com/otonoma/rust-sitter" }
 
 [build-dependencies]
-rust-sitter-tool = "0.4.5"
+rust-sitter-tool = { git = "https://github.com/otonoma/rust-sitter" }
 ```
 
-_Note: By default, Rust Sitter uses a fork of Tree Sitter with a pure-Rust runtime to support `wasm32-unknown-unknown`. To use the standard C runtime instead, disable default features and enable the `tree-sitter-standard` feature_
-
 The first step is to configure your `build.rs` to compile and link the generated Tree Sitter parser:
 
 ```rust
 use std::path::PathBuf;
 
 fn main() {
     println!("cargo:rerun-if-changed=src");
-    rust_sitter_tool::build_parsers(&PathBuf::from("src/main.rs"));
+    // Path to the file containing your grammar and any submodules.
+    rust_sitter_tool::build_parsers("src/grammar/mod.rs");
 }
 ```
 
 ## Defining a Grammar
-Now that we have Rust Sitter added to our project, we can define our grammar. Rust Sitter grammars are defined in annotated Rust modules. First, we define the module that will contain our grammar
-
-```rust
-#[rust_sitter::grammar("arithmetic")]
-mod grammar {
-
-}
-```
+Now that we have Rust Sitter added to our project, we can define our grammar. Rust Sitter grammars are defined in Rust modules. First, we create a module file for the grammar in `src/grammar/mod.rs`. This can be any module; however,
+due to various quirks of the build system, each module may contain only one grammar, and all types
+in the grammar must be defined within that module or one of its submodules.
 
-Then, inside the module, we can define individual AST nodes. For this simple example, we'll define an expression that can be used in a mathematical expression. Note that we annotate this type as `#[rust_sitter::language]` to indicate that it is the root AST type.
+Then, inside the module, we can define individual AST nodes. For this simple example, we'll define an expression that can be used in a mathematical expression. Note that we annotate this type as `#[language]` to indicate that it is the root AST type.
 
 ```rust
-#[rust_sitter::language]
+// in ./src/grammar/mod.rs
+use rust_sitter::Rule;
+#[derive(Rule)]
+#[language]
 pub enum Expr {
     Number(u32),
     Add(Box<Expr>, Box<Expr>)
 }
 ```
 
-Now that we have the type defined, we must annotate the enum variants to describe how to identify them in the text being parsed. First, we can apply `rust_sitter::leaf` to use a regular expression to match digits corresponding to a number, and define a transformation that parses the resulting string into a `u32`.
+Now that we have the type defined, we must annotate the enum variants to describe how to identify them in the text being parsed. First, we can apply `leaf` to use a regular expression to match digits corresponding to a number.
+The matched text is converted into the field's value using the default extraction for its type. For numeric types, this
+defaults to `FromStr`. You can specify an alternate function using `#[with]`.
 
 ```rust
 Number(
-    #[rust_sitter::leaf(pattern = r"\d+", transform = |v| v.parse().unwrap())]
+    #[leaf(re(r"\d+"))]
     u32,
 )
 ```
 
-For the `Add` variant, things are a bit more complicated. First, we add an extra field corresponding to the `+` that must sit between the two sub-expressions. This can be achieved with `text` parameter of `rust_sitter::leaf`, which instructs the parser to match a specific string. Because we are parsing to `()`, we do not need to provide a transformation.
+For the `Add` variant, things are a bit more complicated. First, we add an extra field corresponding to the `+` that must sit between the two sub-expressions. This can be achieved with `text` or `leaf`, either of which instructs the parser to match a specific string.
 
 ```rust
 Add(
     Box<Expr>,
-    #[rust_sitter::leaf(text = "+")] (),
+    #[text("+")] (),
     Box<Expr>,
 )
 ```
 
 If we try to compile this grammar, however, we will see ane error due to conflicting parse trees for expressions like `1 + 2 + 3`, which could be parsed as `(1 + 2) + 3` or `1 + (2 + 3)`. We want the former, so we can add a further annotation specifying that we want left-associativity for this rule.
 
 ```rust
-#[rust_sitter::prec_left(1)]
+#[prec_left(1)]
 Add(
     Box<Expr>,
-    #[rust_sitter::leaf(text = "+")] (),
+    #[text("+")] (),
     Box<Expr>,
 )
 ```
 
 All together, our grammar looks like this:
 
 ```rust
-#[rust_sitter::grammar("arithmetic")]
-mod grammar {
-    #[rust_sitter::language]
-    pub enum Expr {
-        Number(
-            #[rust_sitter::leaf(pattern = r"\d+", transform = |v| v.parse().unwrap())]
-            u32,
-        ),
-        #[rust_sitter::prec_left(1)]
-        Add(
-            Box<Expr>,
-            #[rust_sitter::leaf(text = "+")] (),
-            Box<Expr>,
-        )
-    }
+use rust_sitter::Rule;
+#[derive(Rule)]
+#[language]
+pub enum Expr {
+    Number(
+        #[leaf(re(r"\d+"))]
+        u32,
+    ),
+    #[prec_left(1)]
+    Add(
+        Box<Expr>,
+        #[text("+")] (),
+        Box<Expr>,
+    )
 }
 ```
 
 We can then parse text using this grammar:
 
 ```rust
-dbg!(grammar::parse("1+2+3"));
+dbg!(grammar::Expr::parse("1+2+3").into_result());
 /*
-grammar::parse("1+2+3") = Ok(Add(
+grammar::Expr::parse("1+2+3").into_result() = Ok(Add(
     Add(
         Number(
             1,
@@ -123,84 +122,122 @@ grammar::parse("1+2+3") = Ok(Add(
 ## Type Annotations
 Rust Sitter supports a number of annotations that can be applied to type and fields in your grammar. These annotations can be used to control how the parser behaves, and how the resulting AST is constructed.
 
-### `#[rust_sitter::language]`
+### `#[language]`
 This annotation marks the entrypoint for parsing, and determines which AST type will be returned from parsing. Only one type in the grammar can be marked as the entrypoint.
 
 ```rust
-#[rust_sitter::language]
+#[derive(Rule)]
+#[language]
 struct Code {
     ...
 }
 ````
 
-### `#[rust_sitter::extra]`
-This annotation marks a node as extra and can safely be skipped while parsing. This is useful for handling whitespace/newlines/comments.
+### `#[extras(...)]`
+This annotation can be used on the `#[language]` rule to specify a list of extras. These extras are specified
+using the same DSL as `#[leaf(...)]` and `#[text(...)]`. These rules are inserted into the `extras` array in the
+grammar.
 
 ```rust
-#[rust_sitter::extra]
-struct Whitespace {
-    #[rust_sitter::leaf(pattern = r"\s")]
-    _whitespace: (),
+#[derive(Rule)]
+#[language]
+#[extras(
+    re(r"\s") // allows whitespace in the grammar.
+)]
+struct Code {
+    ...
 }
 ```
 
 ## Field Annotations
-### `#[rust_sitter::leaf(...)]`
-The `#[rust_sitter::leaf(...)]` annotation can be used to define a leaf node in the AST. This annotation takes a number of parameters that control how the parser behaves:
-- the `pattern` parameter takes a regular expression that is used to match the text of the leaf node. This parameter is required.
-- the `text` parameter takes a string that is used to match the text of the leaf node. This parameter is mutually exclusive with `pattern`.
-- the `transform` parameter takes a function that is used to transform the matched text (an `&str`) into the desired type. This parameter is optional if the target type is `()`.
+### `#[leaf(...)]` and `#[text(...)]`
+The `#[leaf(...)]` annotation can be used to define a leaf node in the AST.
+`#[text(...)]` is similar, but it does not create a named node in the grammar and cannot be
+extracted. It must always be assigned to `()`.
+
+`leaf` and `text` take an input that looks like the [tree sitter
+DSL](https://tree-sitter.github.io/tree-sitter/creating-parsers/2-the-grammar-dsl.html). The supported rules
+currently are:
+* `choice`
+* `optional`
+* `seq`
+* `re` or `pattern` to specify a regular expression
+* literal text
+
+Others can be added in the future as needed.
 
 `leaf` can either be applied to a field in a struct / enum variant (as seen above), or directly on a type with no fields:
 
 ```rust
-#[rust_sitter::leaf(text = "9")]
+#[derive(Rule)]
+#[leaf("9")]
 struct BigDigit;
 
+#[derive(Rule)]
 enum SmallDigit {
-    #[rust_sitter::leaf(text = "0")]
+    #[leaf("0")]
     Zero,
-    #[rust_sitter::leaf(text = "1")]
+    #[leaf("1")]
     One,
 }
 ```
 
-### `#[rust_sitter::prec(...)]` / `#[rust_sitter::prec_left(...)]` / `#[rust_sitter::prec_right(...)]`
+### `#[prec(...)]` / `#[prec_left(...)]` / `#[prec_right(...)]` / `#[prec_dynamic(...)]`
 This annotation can be used to define a non/left/right-associative operator. This annotation takes a single parameter, which is the precedence level of the operator (higher binds more tightly).
 
-### `#[rust_sitter::skip(...)]`
+### `#[immediate]`
+Usually, whitespace is optional before each token. This attribute means that the token will only match if there is no whitespace before it.
+
+### `#[skip(...)]`
 This annotation can be used to define a field that does not correspond to anything in the input string, such as some metadata. This annotation takes a single parameter, which is the value that should be used to populate that field at runtime.
 
-### `#[rust_sitter::word]`
-This annotation marks the field as a Tree Sitter [word](https://tree-sitter.github.io/tree-sitter/creating-parsers#keywords), which is useful when handling errors involving keywords. Only one field in the grammar can be marked as a word.
+### `#[word]`
+This annotation designates a rule as the Tree Sitter [word](https://tree-sitter.github.io/tree-sitter/creating-parsers#keywords), which is useful when handling errors involving keywords. Like `#[extras]`, `#[word]` is specified on the `#[language]` type:
+
+```rust
+#[derive(Debug, Rule)]
+#[language]
+#[word(Ident)]
+pub struct Language {
+    // ...
+}
+
+#[derive(Rule)]
+#[leaf(re(r"[a-zA-Z_]+"))]
+pub struct Ident;
+```
+
+## Partial AST and Errors
+rust-sitter, like tree-sitter, can produce a partial AST along with its errors. Calling `Language::parse` will
+produce a `ParseResult` object which includes as much of the AST as it was able to extract, as well as a `Vec`
+of all of the parsing errors encountered. This is useful for language servers and other contexts which can
+make use of a partial AST. Currently this may not produce the _maximal_ AST, but this may be possible
+in the future.
 
 ## Special Types
 Rust Sitter has a few special types that can be used to define more complex grammars.
 
 ### `Vec<T>`
 To parse repeating structures, you can use a `Vec<T>` to parse a list of `T`s. Note that the `Vec<T>` type **cannot** be wrapped in another `Vec` (create additional structs if this is necessary). There are two special attributes that can be applied to a `Vec` field to control the parsing behavior.
 
-The `#[rust_sitter::delimited(...)]` attribute can be used to specify a separator between elements of the list, and takes a parameter of the same format as an unnamed field. For example, we can define a grammar that parses a comma-separated list of expressions:
+The `#[sep_by(...)]` attribute can be used to specify a separator between elements of the
+list. This is parsed in the same way as `text` and `leaf` and therefore supports all of the tree-sitter
+DSL rules listed above.
 
 ```rust
 pub struct CommaSeparatedExprs {
-    #[rust_sitter::delimited(
-        #[rust_sitter::leaf(text = ",")]
-        ()
-    )]
+    #[sep_by(",")]
     numbers: Vec<Expr>,
 }
 ```
 
-The `#[rust_sitter::repeat(...)]` attribute can be used to specify additional configuration for the parser. Currently, there is only one available parameter: `non_empty`, which takes a boolean that specifies if the list must contain at least one element. For example, we can define a grammar that parses a non-empty comma-separated list of numbers:
+The `#[repeat1]` attribute can be used to specify that the list must contain at least one element, or you can use `#[sep_by1(...)]` to combine this requirement with a separator:
 
 ```rust
 pub struct CommaSeparatedExprs {
-    #[rust_sitter::repeat(non_empty = true)]
-    #[rust_sitter::delimited(
-        #[rust_sitter::leaf(text = ",")]
-        ()
-    )]
+    #[repeat1]
+    #[sep_by(",")]
+    // Or just use #[sep_by1(",")]
     numbers: Vec<Expr>,
 }
 ```
@@ -210,11 +247,7 @@ To parse optional structures, you can use an `Option<T>` to parse a single `T` o
 
 ```rust
 pub struct CommaSeparatedExprs {
-    #[rust_sitter::repeat(non_empty = true)]
-    #[rust_sitter::delimited(
-        #[rust_sitter::leaf(text = ",")]
-        ()
-    )]
+    #[sep_by1(",")]
     numbers: Vec<Option<Expr>>,
 }
 ```
@@ -224,18 +257,10 @@ When using Rust Sitter to power diagnostic tools, it can be helpful to access sp
 
 ```rust
 pub struct CommaSeparatedExprs {
-    #[rust_sitter::repeat(non_empty = true)]
-    #[rust_sitter::delimited(
-        #[rust_sitter::leaf(text = ",")]
-        ()
-    )]
+    #[sep_by1(",")]
     numbers: Vec<Option<Spanned<Expr>>>,
 }
 ```
 
 ### `Box<T>`
 Boxes are automatically constructed around the inner type when parsing, but Rust Sitter doesn't do anything extra beyond that.
-
-## Debugging
-
-To view the generated grammar, you can set the `RUST_SITTER_EMIT_ARTIFACTS` environment variable to `true`. This will cause the generated grammar to be written to wherever cargo sets `OUT_DIR` (usually `target/debug/build/<crate>-<hash>/out`).
diff --git a/common/Cargo.toml b/common/Cargo.toml
@@ -3,9 +3,9 @@ name = "rust-sitter-common"
 description = "Shared logic for the Rust Sitter macro and tool"
 readme = "../README.md"
 repository = "https://github.com/hydro-project/rust-sitter"
-version = "0.4.5"
-authors = ["Shadaj Laddad <shadaj@users.noreply.github.com>"]
-edition = "2021"
+version.workspace = true
+authors.workspace = true
+edition = "2024"
 license = "MIT"
 keywords = ["parsing", "codegen"]
 categories = ["development-tools"]
@@ -14,5 +14,10 @@ categories = ["development-tools"]
 path = "src/lib.rs"
 
 [dependencies]
+rust-sitter-types = { path = "../types" }
 syn = { version = "2", features = [ "full", "extra-traits" ] }
+proc-macro2 = "1"
 quote = "1"
+
+serde_json = "1"
+itertools = "0.14"