Skip to content
31 changes: 25 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,19 @@ To run with command-line options:
cargo run --release -- --config datafusion-fuzzer.toml --rounds 5 --queries-per-round 20
```

To print verbose oracle and query logs to stdout:
```bash
RUST_LOG=info cargo run -- --config datafusion-fuzzer.toml --display-logs
```

## Oracles

For each test case, the runner currently picks one oracle at random from the configured `oracles` set. Available oracles include:

- `NoCrashOracle`: checks for non-whitelisted crashes/errors.
- `TlpWhereOracle`: validates Ternary Logic Partitioning (TLP) over `WHERE` (`p`, `NOT p`, `p IS NULL`) via value-level multiset comparison.
- `TlpHavingOracle`: validates TLP partitioning over `HAVING` (`p`, `NOT p`, `p IS NULL`) via value-level multiset comparison.

## Configuration

The fuzzer supports extensive configuration options to customize the fuzzing process.
Expand All @@ -45,18 +58,21 @@ See `datafusion-fuzzer.toml` for an example configuration file:
seed = 42
rounds = 3
queries_per_round = 10
timeout_seconds = 30
timeout_seconds = 2

# Logging settings
display_logs = true
enable_tui = false
# log_path = "logs/datafusion-fuzzer.log"
display_logs = false
enable_tui = true
log_path = "logs"
sample_interval_secs = 5

# Table generation parameters
max_column_count = 5
max_row_count = 100
max_expr_level = 3
max_group_by_count = 3
max_table_count = 3
max_insert_per_table = 20
```

### Command Line Options
Expand All @@ -69,6 +85,8 @@ Options:
-q, --queries-per-round <QUERIES> Number of queries per round
-t, --timeout <TIMEOUT> Query timeout in seconds
-l, --log-path <LOG_PATH> Path to log file
-d, --display-logs Display logs
--enable-tui Enable TUI display
-h, --help Print help
-V, --version Print version
```
Expand All @@ -79,13 +97,14 @@ Options:
- `max_column_count`: Maximum number of columns per generated table (default: 5)
- `max_row_count`: Maximum number of rows per generated table (default: 100)
- `max_expr_level`: Maximum expression nesting level (default: 3)
- `max_group_by_count`: Maximum number of `GROUP BY` expressions (default: 3)

## Progress Tracker
### SQL Features
- [x] where
- [ ] sort + limit, offset
- [ ] aggregate
- [ ] having
- [x] having
- [ ] join
- [ ] union/union all/intersect/except

Expand All @@ -112,4 +131,4 @@ Options:

## License

[MIT](LICENSE)
[MIT](LICENSE)
5 changes: 3 additions & 2 deletions datafusion-fuzzer.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,11 @@ sample_interval_secs = 5
max_column_count = 5
max_row_count = 100
max_expr_level = 3
max_group_by_count = 3
max_table_count = 3
max_insert_per_table = 20

# Supported oracles: NoCrash, NestedQueries, TlpWhere.
# Supported oracles: NoCrash, NestedQueries, TlpWhere, TlpHaving.
# Randomly select one oracle from the configured set for each query.
oracles = ["NoCrash"]
# oracles = ["NoCrash", "NestedQueries", "TlpWhere"]
# oracles = ["NoCrash", "NestedQueries", "TlpWhere", "TlpHaving"]
2 changes: 2 additions & 0 deletions src/cli/runner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,7 @@ mod tests {
max_column_count: 3,
max_row_count: 10,
max_expr_level: 2,
max_group_by_count: 2,
max_table_count: 3,
max_insert_per_table: 20,
oracles: vec![crate::oracle::ConfiguredOracle::NoCrash],
Expand Down Expand Up @@ -599,6 +600,7 @@ mod tests {
max_column_count: 3,
max_row_count: 10,
max_expr_level: 2,
max_group_by_count: 2,
max_table_count: 3,
max_insert_per_table: 20,
oracles: vec![crate::oracle::ConfiguredOracle::NoCrash],
Expand Down
15 changes: 13 additions & 2 deletions src/fuzz_context/runner_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ pub struct RunnerConfig {
pub max_column_count: u64,
pub max_row_count: u64,
pub max_expr_level: u32,
#[serde(default = "RunnerConfig::default_max_group_by_count")]
pub max_group_by_count: u32,
pub max_table_count: u32,
pub max_insert_per_table: u32,
#[serde(default = "RunnerConfig::default_oracles", alias = "oracle")]
Expand Down Expand Up @@ -100,6 +102,10 @@ impl RunnerConfig {
/// Fallback oracle set: a single [`ConfiguredOracle::NoCrash`] entry.
///
/// Referenced by the `#[serde(default = ...)]` attribute on the `oracles`
/// field and by the `Default` implementation of `RunnerConfig`.
fn default_oracles() -> Vec<ConfiguredOracle> {
    Vec::from([ConfiguredOracle::NoCrash])
}

/// Fallback value for `max_group_by_count`.
///
/// Referenced by the `#[serde(default = ...)]` attribute on the
/// `max_group_by_count` field and by the `Default` implementation of
/// `RunnerConfig`, so both paths agree on the same number.
fn default_max_group_by_count() -> u32 {
    const DEFAULT_MAX_GROUP_BY_COUNT: u32 = 3;
    DEFAULT_MAX_GROUP_BY_COUNT
}
}

impl Default for RunnerConfig {
Expand All @@ -116,6 +122,7 @@ impl Default for RunnerConfig {
max_column_count: 5,
max_row_count: 100,
max_expr_level: 3,
max_group_by_count: Self::default_max_group_by_count(),
max_table_count: 3,
max_insert_per_table: 20,
oracles: Self::default_oracles(),
Expand All @@ -142,6 +149,7 @@ sample_interval_secs = 5
max_column_count = 5
max_row_count = 100
max_expr_level = 3
max_group_by_count = 3
max_table_count = 3
max_insert_per_table = 20
oracles = ["NoCrash"]
Expand All @@ -167,9 +175,10 @@ sample_interval_secs = 5
max_column_count = 5
max_row_count = 100
max_expr_level = 3
max_group_by_count = 3
max_table_count = 3
max_insert_per_table = 20
oracles = ["NoCrash", "NestedQueries", "TlpWhere"]
oracles = ["NoCrash", "NestedQueries", "TlpWhere", "TlpHaving"]
"#,
)
.unwrap();
Expand All @@ -179,7 +188,8 @@ oracles = ["NoCrash", "NestedQueries", "TlpWhere"]
vec![
ConfiguredOracle::NoCrash,
ConfiguredOracle::NestedQueries,
ConfiguredOracle::TlpWhere
ConfiguredOracle::TlpWhere,
ConfiguredOracle::TlpHaving
]
);
}
Expand All @@ -199,6 +209,7 @@ sample_interval_secs = 5
max_column_count = 5
max_row_count = 100
max_expr_level = 3
max_group_by_count = 3
max_table_count = 3
max_insert_per_table = 20
oracles = []
Expand Down
7 changes: 7 additions & 0 deletions src/oracle/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
pub(crate) mod oracle_common;
pub mod oracle_impl_nested_queries;
pub mod oracle_impl_no_crash;
pub mod oracle_impl_tlp_having;
pub mod oracle_impl_tlp_where;
pub mod oracle_trait;
#[cfg(test)]
pub(crate) mod test_helpers;

use std::sync::Arc;

Expand All @@ -15,6 +18,7 @@ use crate::fuzz_context::GlobalContext;
// Re-export main types and traits
pub use oracle_impl_nested_queries::NestedQueriesOracle;
pub use oracle_impl_no_crash::NoCrashOracle;
pub use oracle_impl_tlp_having::TlpHavingOracle;
pub use oracle_impl_tlp_where::TlpWhereOracle;
pub use oracle_trait::{Oracle, QueryContext, QueryExecutionResult};

Expand All @@ -26,6 +30,8 @@ pub enum ConfiguredOracle {
NestedQueries,
#[serde(rename = "TlpWhere", alias = "TlpWhereOracle")]
TlpWhere,
#[serde(rename = "TlpHaving", alias = "TlpHavingOracle")]
TlpHaving,
}

impl ConfiguredOracle {
Expand All @@ -34,6 +40,7 @@ impl ConfiguredOracle {
Self::NoCrash => Box::new(NoCrashOracle::new(seed, ctx)),
Self::NestedQueries => Box::new(NestedQueriesOracle::new(seed, ctx)),
Self::TlpWhere => Box::new(TlpWhereOracle::new(seed, ctx)),
Self::TlpHaving => Box::new(TlpHavingOracle::new(seed, ctx)),
}
}
}
87 changes: 87 additions & 0 deletions src/oracle/oracle_common.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,29 @@
use crate::common::{Result, fuzzer_err, util};
use crate::oracle::QueryExecutionResult;

/// Checks that a pair of query results is mutually consistent.
///
/// * `results` must hold exactly two entries; any other length is reported
///   as an error.
/// * `oracle_name` is only used to prefix error messages and is forwarded to
///   the value comparison.
///
/// Outcomes:
/// * both queries succeeded: the result is whatever
///   `validate_value_equivalence` returns for indices `0` and `1`;
/// * both queries failed: `Ok(())`;
/// * one succeeded and one failed: an error describing the mixed outcome.
pub(crate) fn validate_binary_tlp_consistency(
    results: &[QueryExecutionResult],
    oracle_name: &str,
) -> Result<()> {
    let result_count = results.len();
    if result_count != 2 {
        let message = format!("{oracle_name} expects 2 query results, got {result_count}");
        return Err(fuzzer_err(&message));
    }

    // Exactly two results from here on, so the ok/err counts always sum to 2.
    let num_err = results.iter().filter(|r| r.result.is_err()).count();
    let num_ok = result_count - num_err;

    if num_err == 0 {
        return validate_value_equivalence(results, 0, 1, oracle_name);
    }
    if num_ok == 0 {
        // Both sides failed: treated as consistent.
        return Ok(());
    }

    let message = format!(
        "{oracle_name} consistency requires all queries to either succeed or fail; got mixed outcomes (ok={num_ok}, err={num_err})"
    );
    Err(fuzzer_err(&message))
}

pub(crate) fn validate_value_equivalence(
results: &[QueryExecutionResult],
left_idx: usize,
Expand All @@ -25,3 +48,67 @@ pub(crate) fn validate_value_equivalence(

util::validate_batches_value_equivalence(left_batches, right_batches, oracle_name)
}

/// Appends one section per query result to `report`.
///
/// Each section consists of a header line `Q<n> [<label>]:` (with `n`
/// starting at 1), the query text, and a status line giving either the total
/// row count on success or the error details on failure.
///
/// `labels` is matched to `results` by position; a result without a
/// corresponding label is shown as `unknown`.
pub(crate) fn append_labeled_query_results(
    report: &mut String,
    results: &[QueryExecutionResult],
    labels: &[&str],
) {
    for (position, entry) in results.iter().enumerate() {
        let label = match labels.get(position) {
            Some(name) => *name,
            None => "unknown",
        };

        let header = format!(
            "Q{} [{}]:\n{}\n",
            position + 1,
            label,
            entry.query_context.query
        );
        report.push_str(&header);

        let status = match &entry.result {
            Ok(batches) => format!(
                " status: ok, rows={}\n\n",
                util::count_total_rows(batches)
            ),
            Err(e) => format!(" status: error, details={}\n\n", e),
        };
        report.push_str(&status);
    }
}

/// Appends a value-level comparison of two successful query results to
/// `report`.
///
/// The function does nothing and returns `Ok(())` unless `results` holds
/// exactly two entries and both of them succeeded. Otherwise it writes the
/// row counts of both sides (index 0 is reported as `all`, index 1 as
/// `partition_union`), followed by either a listing of the multiset
/// differences or a line confirming multiset equivalence.
///
/// # Errors
///
/// Propagates any error returned by `util::batches_to_row_multiset`.
pub(crate) fn append_binary_value_equivalence_report(
    report: &mut String,
    results: &[QueryExecutionResult],
) -> Result<()> {
    // Only a pair of successful results can be compared; anything else is
    // silently skipped.
    let (q_all_batches, q_union_batches) = match results {
        [all_run, partition_run] => match (&all_run.result, &partition_run.result) {
            (Ok(all_batches), Ok(partition_batches)) => (all_batches, partition_batches),
            _ => return Ok(()),
        },
        _ => return Ok(()),
    };

    let all_rows = util::count_total_rows(q_all_batches);
    let partition_rows = util::count_total_rows(q_union_batches);
    report.push_str(&format!(
        "Row counts: all={}, partition_union={}\n",
        all_rows, partition_rows
    ));

    let all_multiset = util::batches_to_row_multiset(q_all_batches)?;
    let partition_multiset = util::batches_to_row_multiset(q_union_batches)?;

    if all_multiset == partition_multiset {
        report.push_str("Multiset equivalence: true\n");
        return Ok(());
    }

    report.push_str("\nTop multiset differences:\n");
    let diff = util::format_row_multiset_diff(&all_multiset, &partition_multiset);
    report.push_str(&diff);
    report.push('\n');

    Ok(())
}
Loading
Loading