Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
283 changes: 278 additions & 5 deletions Cargo.lock

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ ddsketch-agent = { git = "https://github.com/DataDog/saluki", rev = "f47a7ef588c
datadog-protos = { git = "https://github.com/DataDog/saluki", rev = "f47a7ef588c53aa1da35dcfd93808595ebeb1291" }
protobuf = { version = "3.7" }
enum_dispatch = { version = "0.3" }
barkus-core = { git = "https://github.com/DataDog/barkus", rev = "45e307e87900a709398fc878940f33ea04507aa2" }
barkus-ebnf = { git = "https://github.com/DataDog/barkus", rev = "45e307e87900a709398fc878940f33ea04507aa2" }
barkus-peg = { git = "https://github.com/DataDog/barkus", rev = "45e307e87900a709398fc878940f33ea04507aa2" }
barkus-antlr = { git = "https://github.com/DataDog/barkus", rev = "45e307e87900a709398fc878940f33ea04507aa2" }

[workspace.lints.clippy]
all = "deny"
Expand Down
29 changes: 28 additions & 1 deletion deny.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,36 @@ allow = [
]
unused-allowed-license = "allow"

# barkus is a DataDog-internal project whose crates do not yet declare a
# license field. The repo itself is MIT-licensed (see its LICENSE file).
[[licenses.clarify]]
name = "barkus-core"
expression = "MIT"
license-files = []

[[licenses.clarify]]
name = "barkus-ebnf"
expression = "MIT"
license-files = []

[[licenses.clarify]]
name = "barkus-peg"
expression = "MIT"
license-files = []

[[licenses.clarify]]
name = "barkus-antlr"
expression = "MIT"
license-files = []

[[licenses.clarify]]
name = "barkus-parser-common"
expression = "MIT"
license-files = []

[sources]
unknown-git = "deny"
allow-git = ["https://github.com/DataDog/saluki"]
allow-git = ["https://github.com/DataDog/saluki", "https://github.com/DataDog/barkus"]

[advisories]
version = 2
Expand Down
15 changes: 15 additions & 0 deletions examples/grammars/json.ebnf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
(* Simplified JSON grammar for the lading grammar payload generator.
   The trailing "\n" in the start rule makes each generated sample
   newline-delimited, since samples are otherwise concatenated directly. *)
start = value , "\n" ;

value = object | array | string | number | "true" | "false" | "null" ;

(* Objects: zero or more comma-separated key/value pairs. *)
object = "{" [ pair { "," pair } ] "}" ;
pair = string ":" value ;

array = "[" [ value { "," value } ] "]" ;

(* Strings draw from a small fixed alphabet to keep samples compact. *)
string = '"' { character } '"' ;
character = "a" | "b" | "c" | "d" | "e" | "f" | "x" | "y" | "z"
| "0" | "1" | "2" | "3" ;

(* Optional sign, integer part, optional fractional part. *)
number = [ "-" ] digit { digit } [ "." digit { digit } ] ;
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
17 changes: 17 additions & 0 deletions examples/lading-grammar.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Example lading config: generate simplified JSON from an EBNF grammar,
# send it over TCP, and sink it into a TCP blackhole on the same port.
#
# NOTE(review): the extracted copy of this file had its YAML indentation
# stripped; nesting below is restored per the structure documented in
# lading_payload/README.grammar.md (variant is a top-level tcp field).
generator:
  - tcp:
      # 32-byte RNG seed; fixed so payload generation is reproducible.
      seed: [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53,
             59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131]
      addr: "127.0.0.1:8282"
      bytes_per_second: "10 MiB"
      maximum_prebuild_cache_size_bytes: "64 MiB"
      variant:
        grammar:
          # Path resolved relative to the lading working directory.
          grammar_path: "examples/grammars/json.ebnf"
          format: ebnf
          # Caps on expansion recursion depth and AST nodes per sample.
          max_depth: 20
          max_total_nodes: 5000

blackhole:
  - tcp:
      binding_addr: "0.0.0.0:8282"
6 changes: 6 additions & 0 deletions lading_payload/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,18 @@ tracing = { workspace = true }
tokio = { workspace = true }
arbitrary = { version = "1", optional = true, features = ["derive"] }
enum_dispatch = { workspace = true }
barkus-core = { workspace = true }
barkus-ebnf = { workspace = true }
barkus-peg = { workspace = true }
barkus-antlr = { workspace = true }
rand_0_10 = { package = "rand", version = "0.10", default-features = false }

[dev-dependencies]
proptest = { workspace = true }
proptest-derive = { workspace = true }
criterion = { version = "0.8", features = ["html_reports"] }
rustc-hash = { workspace = true }
tempfile = { workspace = true }

[features]
default = []
Expand Down
106 changes: 106 additions & 0 deletions lading_payload/README.grammar.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Grammar payload

Generates structured data from EBNF, PEG, or ANTLR v4 grammar files using
[barkus](https://github.com/DataDog/barkus). Samples are concatenated directly
with no injected delimiters — if you need newline-delimited output, include a
trailing `\n` in your grammar's start production.

## Using it in a lading config

Reference the variant by name with a `grammar_path` pointing at the grammar
file and a `format` field indicating the grammar type.

**TCP generator** (variant is a top-level field):

```yaml
generator:
- tcp:
seed: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]
addr: "127.0.0.1:8080"
bytes_per_second: "10 MiB"
maximum_prebuild_cache_size_bytes: "64 MiB"
variant:
grammar:
grammar_path: "/path/to/json.ebnf"
format: ebnf
max_depth: 20
max_total_nodes: 5000
```

**HTTP generator** (variant is nested under `method.post`):

```yaml
generator:
- http:
seed: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]
headers: {}
target_uri: "http://127.0.0.1:8080/"
bytes_per_second: "1 MiB"
parallel_connections: 1
method:
post:
maximum_prebuild_cache_size_bytes: "10 MiB"
variant:
grammar:
grammar_path: "/path/to/sql.g4"
format: antlr
```

## Configuration fields

| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `grammar_path` | path | (required) | Path to the grammar file. Can be absolute or relative to the lading working directory. |
| `format` | string | (required) | Grammar format: `ebnf`, `peg`, or `antlr`. |
| `max_depth` | integer | 30 | Maximum recursion depth for the grammar expansion. Lower values produce smaller, simpler output. |
| `max_total_nodes` | integer | 20000 | Maximum AST nodes per generated sample. Limits the total size of each output. |

## Supported grammar formats

- **EBNF** (`format: ebnf`): ISO/IEC 14977 Extended Backus-Naur Form. Rules
use `=` and terminate with `;`. Repetition with `{ }`, optional with `[ ]`.
- **PEG** (`format: peg`): Parsing Expression Grammars. Rules use `<-` or `=`.
Ordered choice with `/`, quantifiers `?`, `*`, `+`.
- **ANTLR** (`format: antlr`): ANTLR v4 combined or parser grammars (`.g4`
files). Rules use `:` and terminate with `;`. Supports `grammar Name;`
headers and `fragment` rules.

## Example EBNF grammar (simplified JSON)

```ebnf
(* End the start production with a trailing newline for line-delimited output. *)
start = value , "\n" ;

value = object | array | string | number | "true" | "false" | "null" ;

object = "{" [ pair { "," pair } ] "}" ;
pair = string ":" value ;

array = "[" [ value { "," value } ] "]" ;

string = '"' { character } '"' ;
character = "a" | "b" | "c" | "x" | "y" | "z" | "0" | "1" | "2" ;

number = [ "-" ] digit { digit } [ "." digit { digit } ] ;
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
```

This produces output like:

```
{"a1":true,"xz":[false,42,-7.3]}
"abc"
[null,{"y":0}]
```

## Tuning tips

- Start with the defaults (`max_depth: 30`, `max_total_nodes: 20000`). Lower
`max_depth` if you want shallower output or faster generation.
- Increase `max_total_nodes` for grammars with many terminals per sample (e.g.,
large SQL statements).
- If the grammar's start rule requires deep recursion, ensure `max_depth` is at
least as large as the start rule's minimum depth. Lading validates this at
startup and will report an error if the depth budget is too small.
11 changes: 11 additions & 0 deletions lading_payload/src/block.rs
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,17 @@ impl Cache {

construct_block_cache_inner(rng, &mut pyld, maximum_block_bytes, total_bytes.get())?
}
// Grammar-based payload: expand a barkus grammar (EBNF/PEG/ANTLR) into
// samples and pre-build them into the fixed block cache.
crate::Config::Grammar(config) => {
// Construction is fallible: grammar parsing / validation errors are
// surfaced here at startup rather than during block generation.
let mut serializer = crate::Grammar::new(config)?;
// NOTE(review): span name "fixed" presumably mirrors the sibling match
// arms in this cache-construction match — confirm against the rest of
// the match, which is outside this view.
let span = span!(Level::INFO, "fixed", payload = "grammar");
let _guard = span.enter();
construct_block_cache_inner(
&mut rng,
&mut serializer,
maximum_block_bytes,
total_bytes.get(),
)?
}
};

let total_cycle_size = blocks
Expand Down
Loading
Loading