@@ -8,6 +8,11 @@ pub enum ErrorPattern {
88 Contains ( & ' static str ) ,
99 /// Regex pattern match - checks if the error message matches this regex pattern
1010 RegexMatch ( & ' static str ) ,
11+ /// Combined condition: query SQL contains a substring AND error contains a substring
12+ QueryAndErrorContains {
13+ query_sub : & ' static str ,
14+ error_sub : & ' static str ,
15+ } ,
1116}
1217
1318/// Configuration for error whitelist patterns
@@ -76,9 +81,34 @@ static ERROR_PATTERNS: LazyLock<Vec<ErrorPattern>> = LazyLock::new(|| {
7681 ErrorPattern :: Contains ( "Failed to create view" ) ,
7782 // Null - Null
7883 ErrorPattern :: Contains ( "Cannot get result type for null arithmetic Null - Null" ) ,
79- ErrorPattern :: Contains ( "regex parse error" ) ,
84+ // Only whitelist regex parse errors when the query uses a regexp-related function
85+ ErrorPattern :: QueryAndErrorContains {
86+ query_sub: "regexp_replace(" ,
87+ error_sub: "regex parse error" ,
88+ } ,
8089 // Invalid JOIN ON expression like '... t1 natural join t2 on true'
8190 ErrorPattern :: Contains ( "SQL error: ParserError(\" Expected: end of statement, found: ON\" )" ) ,
91+ // For anti joins, the fuzzer might generate join predicates that reference
92+ // columns eliminated by an earlier anti join. Example (note t0.flag is a valid column
93+ // from t0, but it's eliminated by the first RIGHT ANTI JOIN):
94+ // SELECT *
95+ // FROM t0
96+ // RIGHT ANTI JOIN t1 ON TRUE
97+ // RIGHT ANTI JOIN t2 ON t0.flag;
98+ ErrorPattern :: QueryAndErrorContains {
99+ query_sub: "ANTI JOIN" ,
100+ error_sub: "Schema error: No field named" ,
101+ } ,
102+ ErrorPattern :: QueryAndErrorContains {
103+ query_sub: "to_date(" ,
104+ error_sub: "Casting from" ,
105+ } ,
106+ ErrorPattern :: QueryAndErrorContains {
107+ query_sub: "to_char(" ,
108+ error_sub: "Cannot cast" ,
109+ } ,
110+ ErrorPattern :: Contains ( "Regular expression did not compile" ) ,
111+ ErrorPattern :: Contains ( "to_unixtime function unsupported data type" ) ,
82112 // =========================
83113 // Known Issues
84114 // =========================
@@ -97,10 +127,17 @@ static ERROR_PATTERNS: LazyLock<Vec<ErrorPattern>> = LazyLock::new(|| {
97127 ErrorPattern :: Contains ( "Invalid arithmetic operation: Null % Null" ) ,
98128 // https://github.com/apache/datafusion/issues/17390
99129 ErrorPattern :: Contains ( "Schema error: No field named" ) ,
130+ // https://github.com/apache/datafusion/issues/17472
131+ ErrorPattern :: Contains ( "to_local_time" ) ,
100132 // =========================
101133 // Investigate Later
102134 // =========================
103135 ErrorPattern :: Contains ( "Cast error: Format error" ) ,
136+ ErrorPattern :: Contains ( "to_date" ) ,
137+ // This is a function taking an invalid regex, but it triggers a confusing optimizer
138+ // error -- I think the best thing to do is to provide a better error message
139+ ErrorPattern :: Contains ( "Optimizer rule 'simplify_expressions' failed" ) ,
140+ ErrorPattern :: Contains ( "to_timestamp" ) ,
104141 ]
105142} ) ;
106143
@@ -117,6 +154,7 @@ static COMPILED_REGEXES: LazyLock<Vec<Option<Regex>>> = LazyLock::new(|| {
117154 None
118155 }
119156 } ,
157+ ErrorPattern :: QueryAndErrorContains { .. } => None ,
120158 } )
121159 . collect ( )
122160} ) ;
@@ -128,6 +166,7 @@ static COMPILED_REGEXES: LazyLock<Vec<Option<Regex>>> = LazyLock::new(|| {
128166///
129167/// # Arguments
130168/// * `error_msg` - The error message to check
169+ /// * `query_sql` - The SQL text for the query that produced the error, if available
131170///
132171/// # Returns
133172/// * `true` if the error message matches any whitelisted pattern
@@ -138,13 +177,13 @@ static COMPILED_REGEXES: LazyLock<Vec<Option<Regex>>> = LazyLock::new(|| {
138177/// use datafusion_fuzzer::cli::error_whitelist::is_error_whitelisted;
139178///
140179/// // These should match if the patterns are configured
141- /// assert!(is_error_whitelisted("Query failed: Arrow error: Divide by zero error"));
142- /// assert!(is_error_whitelisted("Some context: Arrow error: Divide by zero error here"));
180+ /// assert!(is_error_whitelisted("Query failed: Arrow error: Divide by zero error", None ));
181+ /// assert!(is_error_whitelisted("Some context: Arrow error: Divide by zero error here", None ));
143182///
144183/// // This should not match
145- /// assert!(!is_error_whitelisted("Unexpected segmentation fault"));
184+ /// assert!(!is_error_whitelisted("Unexpected segmentation fault", None ));
146185/// ```
147- pub fn is_error_whitelisted ( error_msg : & str ) -> bool {
186+ pub fn is_error_whitelisted ( error_msg : & str , query_sql : Option < & str > ) -> bool {
148187 for ( i, pattern) in ERROR_PATTERNS . iter ( ) . enumerate ( ) {
149188 match pattern {
150189 ErrorPattern :: Contains ( exact_str) => {
@@ -159,6 +198,16 @@ pub fn is_error_whitelisted(error_msg: &str) -> bool {
159198 }
160199 }
161200 }
201+ ErrorPattern :: QueryAndErrorContains {
202+ query_sub,
203+ error_sub,
204+ } => {
205+ if let Some ( sql) = query_sql {
206+ if sql. contains ( query_sub) && error_msg. contains ( error_sub) {
207+ return true ;
208+ }
209+ }
210+ }
162211 }
163212 }
164213
@@ -172,6 +221,15 @@ pub fn get_configured_patterns() -> Vec<String> {
172221 . map ( |pattern| match pattern {
173222 ErrorPattern :: Contains ( s) => format ! ( "Exact: {}" , s) ,
174223 ErrorPattern :: RegexMatch ( s) => format ! ( "Regex: {}" , s) ,
224+ ErrorPattern :: QueryAndErrorContains {
225+ query_sub,
226+ error_sub,
227+ } => {
228+ format ! (
229+ "QueryAndError: query contains '{}' AND error contains '{}'" ,
230+ query_sub, error_sub
231+ )
232+ }
175233 } )
176234 . collect ( )
177235}
0 commit comments