Skip to content

Commit ac5a79f

Browse files
committed
Adding regex replacement feature
1 parent c559db8 commit ac5a79f

File tree

3 files changed

+80
-24
lines changed

3 files changed

+80
-24
lines changed

README.md

+48-24
Original file line numberDiff line numberDiff line change
@@ -155,30 +155,31 @@ cargo run --release -- -l en -d ../texts/ extract-file >> file.en.txt
155155

156156
The following rules can be configured per language. Add a `<language>.toml` file in the `rules` directory to enable a new locale. Note that the `replacements` get applied before any other rules are checked.
157157

158-
| Name | Description | Values | Default |
159-
|--------|-----------------------|---------|---------|
160-
| abbreviation_patterns | Regex defining abbreviations | Rust Regex Array | all abbreviations allowed
161-
| allowed_symbols_regex | Regex of allowed symbols or letters. Each character gets matched against this pattern. | String Array | not used
162-
| broken_whitespace | Array of broken whitespaces. This could for example disallow two spaces following each other | String Array | all types of whitespaces allowed
163-
| disallowed_symbols | Use `allowed_symbols_regex` instead. Array of disallowed symbols or letters. Only used when allowed_symbols_regex is not set or is an empty String. | String Array | all symbols allowed
164-
| disallowed_words | Array of disallowed words. Prefer the blocklist approach when possible. | String Array | all words allowed
165-
| even_symbols | Symbols that always need an even count | Char Array | []
166-
| matching_symbols | Symbols that map to another | Array of matching configurations: each configuration is an Array of two values: `["match", "match"]`. See example below. | []
167-
| max_word_count | Maximum number of words in a sentence | integer | 14
168-
| may_end_with_colon | If a sentence can end with a : or not | boolean | false
169-
| min_characters | Minimum of character occurrences | integer | 0
170-
| max_characters | Maximum of character occurrences | integer | MAX
171-
| min_trimmed_length | Minimum length of string after trimming | integer | 3
172-
| min_word_count | Minimum number of words in a sentence | integer | 1
173-
| needs_letter_start | If a sentence needs to start with a letter | boolean | true
174-
| needs_punctuation_end | If a sentence needs to end with a punctuation | boolean | false
175-
| needs_uppercase_start | If a sentence needs to start with an uppercase | boolean | false
176-
| other_patterns | Regex to disallow anything else | Rust Regex Array | all other patterns allowed
177-
| quote_start_with_letter | If a quote needs to start with a letter | boolean | true
178-
| remove_brackets_list | Removes (possibly nested) user defined brackets and content inside them `(anything [else])` from the sentence before replacements and checking other rules | Array of matching brackets: each configuration is an Array of two values: `["opening_bracket", "closing_bracket"]`. See example below. | []
179-
| replacements | Replaces abbreviations or other words according to configuration. This happens before any other rules are checked. | Array of replacement configurations: each configuration is an Array of two values: `["search", "replacement"]`. See example below. | nothing gets replaced
180-
| segmenter | Segmenter to use for this language. See below for more information. | "python" | using `rust-punkt` by default
181-
| stem_separator_regex | If given, splits words at the given characters to reach the stem words to check them again against the blacklist, e.g. prevents "Rust's" to pass if "Rust" is in the blacklist. | Simple regex of separators, e.g. for apostrophe `stem_separator_regex = "[']"` | ""
158+
| Name | Description | Values | Default |
159+
|-------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|---------|
160+
| abbreviation_patterns | Regex defining abbreviations | Rust Regex Array | all abbreviations allowed
161+
| allowed_symbols_regex | Regex of allowed symbols or letters. Each character gets matched against this pattern. | String Array | not used
162+
| broken_whitespace | Array of broken whitespaces. This could for example disallow two spaces following each other | String Array | all types of whitespaces allowed
163+
| disallowed_symbols | Use `allowed_symbols_regex` instead. Array of disallowed symbols or letters. Only used when allowed_symbols_regex is not set or is an empty String. | String Array | all symbols allowed
164+
| disallowed_words | Array of disallowed words. Prefer the blocklist approach when possible. | String Array | all words allowed
165+
| even_symbols | Symbols that always need an even count | Char Array | []
166+
| matching_symbols | Symbols that map to another | Array of matching configurations: each configuration is an Array of two values: `["match", "match"]`. See example below. | []
167+
| max_word_count | Maximum number of words in a sentence | integer | 14
168+
| may_end_with_colon | If a sentence can end with a : or not | boolean | false
169+
| min_characters | Minimum of character occurrences | integer | 0
170+
| max_characters | Maximum of character occurrences | integer | MAX
171+
| min_trimmed_length | Minimum length of string after trimming | integer | 3
172+
| min_word_count | Minimum number of words in a sentence | integer | 1
173+
| needs_letter_start | If a sentence needs to start with a letter | boolean | true
174+
| needs_punctuation_end | If a sentence needs to end with a punctuation | boolean | false
175+
| needs_uppercase_start | If a sentence needs to start with an uppercase | boolean | false
176+
| other_patterns | Regex to disallow anything else | Rust Regex Array | all other patterns allowed
177+
| quote_start_with_letter | If a quote needs to start with a letter | boolean | true
178+
| remove_brackets_list | Removes (possibly nested) user defined brackets and content inside them `(anything [else])` from the sentence before replacements and checking other rules | Array of matching brackets: each configuration is an Array of two values: `["opening_bracket", "closing_bracket"]`. See example below. | []
179+
| replacements | Replaces abbreviations or other words according to configuration. This happens before any other rules are checked. | Array of replacement configurations: each configuration is an Array of two values: `["search", "replacement"]`. See example below. | nothing gets replaced
180+
| regex_replacement_list | Finds every match of the given regex and applies the search/replacement only within each matched pattern. This happens before any other rules are checked. | Array of configurations: each configuration is an Array of three values: `["regex", "search", "replacement"]`. See example below. | nothing gets replaced
181+
| segmenter | Segmenter to use for this language. See below for more information. | "python" | using `rust-punkt` by default
182+
| stem_separator_regex | If given, splits words at the given characters to reach the stem words and checks them again against the blacklist, e.g. prevents "Rust's" from passing if "Rust" is in the blacklist. | Simple regex of separators, e.g. for apostrophe `stem_separator_regex = "[']"` | ""
182183

183184
### Example for `matching_symbols`
184185

@@ -239,6 +240,29 @@ Input: I am foo test a test
239240
Output: I am hi a hi
240241
```
241242

243+
### Example for `regex_replacement_list`
244+
245+
```
246+
regex_replacement_list = [
247+
# Split glued sentences
248+
["\\ [a-z]{3,}\\.[A-Z][a-z]{2,}\\ ", ".", ". "],
249+
250+
# Split long sentences
251+
["\\b(?:\\S+\\s+){15,}\\S+[.!?]", ", but ", ". But "],
252+
]
253+
```
254+
255+
This will find words that glue two sentences and will add a space to un-glue them.
256+
It will also split a long sentence into two shorter ones.
257+
258+
```
259+
Input: A sentence.Glued to another.
260+
Output: A sentence. Glued to another.
261+
262+
Input: A first part of a long sentence that would be rejected, but in fact it could be used.
263+
Output: A first part of a long sentence that would be rejected. But in fact it could be used.
264+
```
265+
242266
## Using disallowed words
243267

244268
In order to increase the quality of the final output, you might want to consider filtering out some words that are complex, too long or non-native.

src/replacer.rs

+29
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,19 @@ pub fn replace_strings(rules: &Rules, raw: &str) -> String {
2828
}
2929
}
3030

31+
// regex replacements
32+
for regex_replacement in rules.regex_replacement_list.iter() {
33+
if Value::as_array(regex_replacement).unwrap().len() == 3 {
34+
let regex = Regex::new(&regex_replacement[0].as_str().unwrap()).unwrap();
35+
let search = regex_replacement[1].as_str().unwrap();
36+
let replacement = regex_replacement[2].as_str().unwrap();
37+
38+
result = regex.replace_all(&result, |caps: &regex::Captures| {
39+
caps[0].replace(search, replacement)
40+
}).to_string();
41+
}
42+
}
43+
3144
result
3245
}
3346

@@ -168,4 +181,20 @@ mod test {
168181
assert_eq!(replace_strings(&rules, &String::from("Four: (content (and nested one)) should be removed.")), "Four: should be removed.");
169182
assert_eq!(replace_strings(&rules, &String::from("Five: (one) (two) and [three] 'and' should stay.")), "Five: and 'and' should stay.");
170183
}
184+
185+
#[test]
186+
fn test_regex_replacement() {
187+
let rules = Rules {
188+
regex_replacement_list: vec![
189+
Value::try_from([
190+
Value::try_from("\\ [a-z]{3,}\\.[A-Z][a-z]{2,}\\ ").unwrap(),
191+
Value::try_from(".").unwrap(),
192+
Value::try_from(". ").unwrap()
193+
]).unwrap(),
194+
],
195+
..Default::default()
196+
};
197+
198+
assert_eq!(replace_strings(&rules, &String::from("A sentence.Glued to another.")), "A sentence. Glued to another.");
199+
}
171200
}

src/rules.rs

+3
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ pub struct Rules {
5757
pub other_patterns: Array,
5858
pub stem_separator_regex: String,
5959
pub replacements: Array,
60+
pub regex_replacement_list: Array,
6061
pub even_symbols: Array,
6162
pub matching_symbols: Array,
6263
}
@@ -84,6 +85,7 @@ impl Default for Rules {
8485
other_patterns: vec![],
8586
stem_separator_regex: String::from(""),
8687
replacements: vec![],
88+
regex_replacement_list: vec![],
8789
even_symbols: vec![],
8890
matching_symbols: vec![],
8991
}
@@ -121,6 +123,7 @@ mod test {
121123
assert_eq!(rules.other_patterns, vec![]);
122124
assert_eq!(rules.stem_separator_regex, String::from(""));
123125
assert_eq!(rules.replacements, vec![]);
126+
assert_eq!(rules.regex_replacement_list, vec![]);
124127
assert_eq!(rules.even_symbols, vec![]);
125128
assert_eq!(rules.matching_symbols, vec![]);
126129
}

0 commit comments

Comments
 (0)