From c640af84f3fa3d51e18fc4cdcbcab8b90e84e96b Mon Sep 17 00:00:00 2001 From: Jean Mertz Date: Mon, 17 Jun 2024 14:26:03 +0200 Subject: [PATCH 1/3] support regex matching against specific keys Signed-off-by: Jean Mertz --- aw-query/src/datatype.rs | 110 +++++++++++++++++++++-------------- aw-transform/src/classify.rs | 30 ++++++++++ 2 files changed, 97 insertions(+), 43 deletions(-) diff --git a/aw-query/src/datatype.rs b/aw-query/src/datatype.rs index c31355dd..7bfa42d3 100644 --- a/aw-query/src/datatype.rs +++ b/aw-query/src/datatype.rs @@ -4,7 +4,7 @@ use std::fmt; use super::functions; use super::QueryError; use aw_models::Event; -use aw_transform::classify::{RegexRule, Rule}; +use aw_transform::classify::{KeyValueRule, RegexRule, Rule}; use serde::{Serialize, Serializer}; use serde_json::value::Value; @@ -297,50 +297,74 @@ impl TryFrom<&DataType> for Rule { )) } }; - if rtype == "none" { - Ok(Self::None) - } else if rtype == "regex" { - let regex_val = match obj.get("regex") { - Some(regex_val) => regex_val, - None => { - return Err(QueryError::InvalidFunctionParameters( - "regex rule is missing the 'regex' field".to_string(), - )) - } - }; - let regex_str = match regex_val { - DataType::String(s) => s, - _ => { - return Err(QueryError::InvalidFunctionParameters( - "the regex field of the regex rule is not a string".to_string(), - )) - } - }; - let ignore_case_val = match obj.get("ignore_case") { - Some(case_val) => case_val, - None => &DataType::Bool(false), - }; - let ignore_case = match ignore_case_val { - DataType::Bool(b) => b, - _ => { + + match rtype.as_str() { + "none" => Ok(Self::None), + "regex" => parse_regex_rule(obj), + "keyvalue" => { + let Some(rules) = obj.get("rules") else { return Err(QueryError::InvalidFunctionParameters( - "the ignore_case field of the regex rule is not a bool".to_string(), - )) - } - }; - let regex_rule = match RegexRule::new(regex_str, *ignore_case) { - Ok(regex_rule) => regex_rule, - Err(err) => { - return Err(QueryError::RegexCompileError(format!( - "Failed to compile regex string '{regex_str}': '{err:?}" - ))) - } - }; - Ok(Self::Regex(regex_rule)) - } else { - Err(QueryError::InvalidFunctionParameters(format!( + "keyval rule is missing the 'rules' field".to_string(), + )); + }; + + let rules = match rules { + DataType::Dict(rules) => rules + .iter() + .map(|(k, v)| Rule::try_from(v).map(|v| (k.to_owned(), v))) + .collect::, _>>()?, + _ => { + return Err(QueryError::InvalidFunctionParameters( + "the rules field of the keyval rule is not a dict".to_string(), + )) + } + }; + + Ok(Rule::KeyValue(KeyValueRule::new(rules))) + } + _ => Err(QueryError::InvalidFunctionParameters(format!( "Unknown rule type '{rtype}'" - ))) + ))), } } } + +fn parse_regex_rule(obj: &HashMap) -> Result { + let regex_val = match obj.get("regex") { + Some(regex_val) => regex_val, + None => { + return Err(QueryError::InvalidFunctionParameters( + "regex rule is missing the 'regex' field".to_string(), + )) + } + }; + let regex_str = match regex_val { + DataType::String(s) => s, + _ => { + return Err(QueryError::InvalidFunctionParameters( + "the regex field of the regex rule is not a string".to_string(), + )) + } + }; + let ignore_case_val = match obj.get("ignore_case") { + Some(case_val) => case_val, + None => &DataType::Bool(false), + }; + let ignore_case = match ignore_case_val { + DataType::Bool(b) => b, + _ => { + return Err(QueryError::InvalidFunctionParameters( + "the ignore_case field of the regex rule is not a bool".to_string(), + )) + } + }; + let regex_rule = match RegexRule::new(regex_str, *ignore_case) { + Ok(regex_rule) => regex_rule, + Err(err) => { + return Err(QueryError::RegexCompileError(format!( + "Failed to compile regex string '{regex_str}': '{err:?}" + ))) + } + }; + Ok(Rule::Regex(regex_rule)) +} diff --git a/aw-transform/src/classify.rs b/aw-transform/src/classify.rs index 114650d6..8caaeaf1 100644 --- a/aw-transform/src/classify.rs +++ b/aw-transform/src/classify.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; + /// Transforms for classifying (tagging and categorizing) events. /// /// Based on code in aw_research: https://github.com/ActivityWatch/aw-research/blob/master/aw_research/classify.py @@ -7,6 +9,7 @@ use fancy_regex::Regex; pub enum Rule { None, Regex(RegexRule), + KeyValue(KeyValueRule), } impl RuleTrait for Rule { @@ -14,6 +17,7 @@ impl RuleTrait for Rule { match self { Rule::None => false, Rule::Regex(rule) => rule.matches(event), + Rule::KeyValue(rule) => rule.matches(event), } } } @@ -62,6 +66,32 @@ impl From for Rule { } } +pub struct KeyValueRule { + rules: HashMap, +} + +impl KeyValueRule { + pub fn new(rules: HashMap) -> Self { + Self { rules } + } +} + +impl RuleTrait for KeyValueRule { + fn matches(&self, event: &Event) -> bool { + self.rules.iter().all(|(key, rule)| { + event + .data + .get(key) + .filter(|_| { + let mut ev = event.clone(); + ev.data.retain(|k, _| k == key); + rule.matches(&ev) + }) + .is_some() + }) + } +} + /// Categorizes a list of events /// /// An event can only have one category, although the category may have a hierarchy, From e830395b8bc062c8ed4b3d49f26004dc62372d2e Mon Sep 17 00:00:00 2001 From: Jean Mertz Date: Mon, 17 Jun 2024 16:12:46 +0200 Subject: [PATCH 2/3] support multiple rules per category Signed-off-by: Jean Mertz --- aw-query/src/datatype.rs | 29 +++++++++++++++++++++++- aw-transform/src/classify.rs | 43 +++++++++++++++++++++++++++++++++++- 2 files changed, 70 insertions(+), 2 deletions(-) diff --git a/aw-query/src/datatype.rs b/aw-query/src/datatype.rs index 7bfa42d3..d4105464 100644 --- a/aw-query/src/datatype.rs +++ b/aw-query/src/datatype.rs @@ -1,10 +1,11 @@ use std::collections::HashMap; use std::fmt; +use std::str::FromStr as _; use super::functions; use super::QueryError; use aw_models::Event; -use aw_transform::classify::{KeyValueRule, RegexRule, Rule}; +use aw_transform::classify::{KeyValueRule, LogicalOperator, LogicalRule, RegexRule, Rule}; use serde::{Serialize, Serializer}; use serde_json::value::Value; @@ -300,6 +301,32 @@ impl TryFrom<&DataType> for Rule { match rtype.as_str() { "none" => Ok(Self::None), + "or" | "and" => { + let Some(rules) = obj.get("rules") else { + return Err(QueryError::InvalidFunctionParameters(format!( + "{} rule is missing the 'rules' field", + rtype + ))); + }; + + let rules = match rules { + DataType::List(rules) => rules + .iter() + .map(Rule::try_from) + .collect::, _>>()?, + _ => { + return Err(QueryError::InvalidFunctionParameters(format!( + "the rules field of the {} rule is not a list", + rtype + ))) + } + }; + + let operator = LogicalOperator::from_str(rtype) + .map_err(QueryError::InvalidFunctionParameters)?; + + Ok(Rule::Logical(LogicalRule::new(rules, operator))) + } "regex" => parse_regex_rule(obj), "keyvalue" => { let Some(rules) = obj.get("rules") else { diff --git a/aw-transform/src/classify.rs b/aw-transform/src/classify.rs index 8caaeaf1..2897825b 100644 --- a/aw-transform/src/classify.rs +++ b/aw-transform/src/classify.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::{collections::HashMap, str::FromStr}; /// Transforms for classifying (tagging and categorizing) events. /// @@ -8,6 +8,7 @@ use fancy_regex::Regex; pub enum Rule { None, + Logical(LogicalRule), Regex(RegexRule), KeyValue(KeyValueRule), } @@ -16,6 +17,7 @@ impl RuleTrait for Rule { fn matches(&self, event: &Event) -> bool { match self { Rule::None => false, + Rule::Logical(rule) => rule.matches(event), Rule::Regex(rule) => rule.matches(event), Rule::KeyValue(rule) => rule.matches(event), } @@ -92,6 +94,45 @@ impl RuleTrait for KeyValueRule { } } +pub enum LogicalOperator { + Or, + And, +} + +impl FromStr for LogicalOperator { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "or" => Ok(Self::Or), + "and" => Ok(Self::And), + _ => Err(format!("Invalid logical operator: {}", s)), + } + } +} + +pub struct LogicalRule { + rules: Vec, + operator: LogicalOperator, +} + +impl LogicalRule { + pub fn new(rules: Vec, operator: LogicalOperator) -> Self { + Self { rules, operator } + } +} + +impl RuleTrait for LogicalRule { + fn matches(&self, event: &Event) -> bool { + use LogicalOperator::{And, Or}; + + match self.operator { + Or => self.rules.iter().any(|rule| rule.matches(event)), + And => self.rules.iter().all(|rule| rule.matches(event)), + } + } +} + /// Categorizes a list of events /// /// An event can only have one category, although the category may have a hierarchy, From d83de8a5687930a0d389e9194337da533809d16b Mon Sep 17 00:00:00 2001 From: Jean Mertz Date: Mon, 17 Jun 2024 16:59:45 +0200 Subject: [PATCH 3/3] simplify implementation Signed-off-by: Jean Mertz --- aw-query/src/datatype.rs | 88 ++++++++++++++++-------------------- aw-transform/src/classify.rs | 52 +++++++-------------- 2 files changed, 56 insertions(+), 84 deletions(-) diff --git a/aw-query/src/datatype.rs b/aw-query/src/datatype.rs index d4105464..dd1affd2 100644 --- a/aw-query/src/datatype.rs +++ b/aw-query/src/datatype.rs @@ -5,7 +5,7 @@ use std::str::FromStr as _; use super::functions; use super::QueryError; use aw_models::Event; -use aw_transform::classify::{KeyValueRule, LogicalOperator, LogicalRule, RegexRule, Rule}; +use aw_transform::classify::{LogicalOperator, LogicalRule, RegexRule, Rule}; use serde::{Serialize, Serializer}; use serde_json::value::Value; @@ -301,54 +301,8 @@ impl TryFrom<&DataType> for Rule { match rtype.as_str() { "none" => Ok(Self::None), - "or" | "and" => { - let Some(rules) = obj.get("rules") else { - return Err(QueryError::InvalidFunctionParameters(format!( - "{} rule is missing the 'rules' field", - rtype - ))); - }; - - let rules = match rules { - DataType::List(rules) => rules - .iter() - .map(Rule::try_from) - .collect::, _>>()?, - _ => { - return Err(QueryError::InvalidFunctionParameters(format!( - "the rules field of the {} rule is not a list", - rtype - ))) - } - }; - - let operator = LogicalOperator::from_str(rtype) - .map_err(QueryError::InvalidFunctionParameters)?; - - Ok(Rule::Logical(LogicalRule::new(rules, operator))) - } + "or" | "and" => parse_logical_rule(obj, rtype), "regex" => parse_regex_rule(obj), - "keyvalue" => { - let Some(rules) = obj.get("rules") else { - return Err(QueryError::InvalidFunctionParameters( - "keyval rule is missing the 'rules' field".to_string(), - )); - }; - - let rules = match rules { - DataType::Dict(rules) => rules - .iter() - .map(|(k, v)| Rule::try_from(v).map(|v| (k.to_owned(), v))) - .collect::, _>>()?, - _ => { - return Err(QueryError::InvalidFunctionParameters( - "the rules field of the keyval rule is not a dict".to_string(), - )) - } - }; - - Ok(Rule::KeyValue(KeyValueRule::new(rules))) - } _ => Err(QueryError::InvalidFunctionParameters(format!( "Unknown rule type '{rtype}'" ))), @@ -356,6 +310,33 @@ impl TryFrom<&DataType> for Rule { } } +fn parse_logical_rule(obj: &HashMap, rtype: &String) -> Result { + let Some(rules) = obj.get("rules") else { + return Err(QueryError::InvalidFunctionParameters(format!( + "{} rule is missing the 'rules' field", + rtype + ))); + }; + + let rules = match rules { + DataType::List(rules) => rules + .iter() + .map(Rule::try_from) + .collect::, _>>()?, + _ => { + return Err(QueryError::InvalidFunctionParameters(format!( + "the rules field of the {} rule is not a list", + rtype + ))) + } + }; + + let operator = + LogicalOperator::from_str(rtype).map_err(QueryError::InvalidFunctionParameters)?; + + Ok(Rule::Logical(LogicalRule::new(rules, operator))) +} + fn parse_regex_rule(obj: &HashMap) -> Result { let regex_val = match obj.get("regex") { Some(regex_val) => regex_val, @@ -385,7 +366,16 @@ fn parse_regex_rule(obj: &HashMap) -> Result )) } }; - let regex_rule = match RegexRule::new(regex_str, *ignore_case) { + let match_field = match obj.get("field") { + Some(DataType::String(v)) => Some(v.to_owned()), + None => None, + _ => { + return Err(QueryError::InvalidFunctionParameters( + "the `field` field of the regex rule is not a string".to_string(), + )) + } + }; + let regex_rule = match RegexRule::new(regex_str, *ignore_case, match_field) { Ok(regex_rule) => regex_rule, Err(err) => { return Err(QueryError::RegexCompileError(format!( diff --git a/aw-transform/src/classify.rs b/aw-transform/src/classify.rs index 2897825b..46fdcbcf 100644 --- a/aw-transform/src/classify.rs +++ b/aw-transform/src/classify.rs @@ -1,4 +1,4 @@ -use std::{collections::HashMap, str::FromStr}; +use std::str::FromStr; /// Transforms for classifying (tagging and categorizing) events. /// @@ -10,7 +10,6 @@ pub enum Rule { None, Logical(LogicalRule), Regex(RegexRule), - KeyValue(KeyValueRule), } impl RuleTrait for Rule { @@ -19,7 +18,6 @@ impl RuleTrait for Rule { Rule::None => false, Rule::Logical(rule) => rule.matches(event), Rule::Regex(rule) => rule.matches(event), - Rule::KeyValue(rule) => rule.matches(event), } } } @@ -30,10 +28,15 @@ trait RuleTrait { pub struct RegexRule { regex: Regex, + field: Option, } impl RegexRule { - pub fn new(regex_str: &str, ignore_case: bool) -> Result { + pub fn new( + regex_str: &str, + ignore_case: bool, + field: Option, + ) -> Result { // can't use `RegexBuilder::case_insensitive` because it's not supported by fancy_regex, // so we need to prefix with `(?i)` to make it case insensitive. let regex = if ignore_case { @@ -43,7 +46,7 @@ impl RegexRule { Regex::new(regex_str)? }; - Ok(RegexRule { regex }) + Ok(RegexRule { regex, field }) } } @@ -56,40 +59,19 @@ impl RuleTrait for RegexRule { fn matches(&self, event: &Event) -> bool { event .data - .values() - .filter(|val| val.is_string()) - .any(|val| self.regex.is_match(val.as_str().unwrap()).unwrap()) + .iter() + .filter(|(field, val)| { + self.field.as_ref().map(|v| &v == field).unwrap_or(true) && val.is_string() + }) + .any(|(_, val)| self.regex.is_match(val.as_str().unwrap()).unwrap()) } } impl From for Rule { fn from(re: Regex) -> Self { - Rule::Regex(RegexRule { regex: re }) - } -} - -pub struct KeyValueRule { - rules: HashMap, -} - -impl KeyValueRule { - pub fn new(rules: HashMap) -> Self { - Self { rules } - } -} - -impl RuleTrait for KeyValueRule { - fn matches(&self, event: &Event) -> bool { - self.rules.iter().all(|(key, rule)| { - event - .data - .get(key) - .filter(|_| { - let mut ev = event.clone(); - ev.data.retain(|k, _| k == key); - rule.matches(&ev) - }) - .is_some() + Rule::Regex(RegexRule { + regex: re, + field: None, }) } } @@ -206,7 +188,7 @@ fn test_rule() { .insert("nonono".into(), serde_json::json!("no match!")); let rule_from_regex = Rule::from(Regex::new("test").unwrap()); - let rule_from_new = Rule::Regex(RegexRule::new("test", false).unwrap()); + let rule_from_new = Rule::Regex(RegexRule::new("test", false, None).unwrap()); let rule_none = Rule::None; assert!(rule_from_regex.matches(&e_match)); assert!(rule_from_new.matches(&e_match));