Skip to content

Commit 01420d7

Browse files
committed
OPENNLP-912: Rules based sentence detector
1 parent af6a6e0 commit 01420d7

File tree

11 files changed

+1674
-0
lines changed

11 files changed

+1674
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package opennlp.tools.sentdetect.segment;
19+
20+
/**
 * A single cleaning rule: a regular expression paired with the text that is
 * substituted for each of its matches.
 */
public class Clean {

  // Kept package-private and non-final: other classes in this package are not
  // visible from here and may rely on direct field access.
  String regex;
  String replacement;

  /**
   * @param regex the regular expression to which this string is to be matched
   * @param replacement the string to be substituted for each match
   */
  public Clean(String regex, String replacement) {
    this.regex = regex;
    this.replacement = replacement;
  }

  /**
   * @return the regular expression given at construction time
   */
  public String getRegex() {
    return regex;
  }

  /**
   * @return the replacement string given at construction time
   */
  public String getReplacement() {
    return replacement;
  }

  /**
   * @return a debug representation of the form
   *     {@code Clean{regex='...', replacement='...'}}
   */
  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder("Clean{");
    sb.append("regex='").append(regex).append('\'');
    sb.append(", replacement='").append(replacement).append('\'');
    sb.append('}');
    return sb.toString();
  }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package opennlp.tools.sentdetect.segment;
19+
20+
import java.util.ArrayList;
21+
import java.util.List;
22+
23+
/**
24+
* removes errant newlines, xhtml, inline formatting, etc.
25+
*/
26+
public class Cleaner {
27+
28+
public List<Clean> cleanList = new ArrayList<Clean>();
29+
30+
public String clean(String text) {
31+
for (Clean clean : cleanList) {
32+
text = text.replaceAll(clean.getRegex(), clean.getReplacement());
33+
}
34+
return text;
35+
}
36+
37+
public void clear() {
38+
if (cleanList != null) {
39+
cleanList.clear();
40+
}
41+
}
42+
43+
/**
44+
* TODO: Move rules into profiles
45+
*/
46+
public void rules() {
47+
48+
cleanList.add(new Clean("\\n(?=[a-zA-Z]{1,2}\\n)", ""));
49+
50+
cleanList.add(new Clean("\\n \\n", "\n"));
51+
52+
cleanList.add(new Clean("\\n\\n", "\n"));
53+
54+
cleanList.add(new Clean("\\n(?=\\.(\\s|\\n))", ""));
55+
cleanList.add(new Clean("(?<=\\s)\\n", ""));
56+
cleanList.add(new Clean("(?<=\\S)\\n(?=\\S)", " \n "));
57+
cleanList.add(new Clean("\\n", "\n"));
58+
cleanList.add(new Clean("\\\\n", "\n"));
59+
cleanList.add(new Clean("\\\\\\ n", "\n"));
60+
61+
cleanList.add(new Clean("\\{b\\^&gt;\\d*&lt;b\\^\\}|\\{b\\^>\\d*<b\\^\\}",""));
62+
63+
cleanList.add(new Clean("\\.{4,}\\s*\\d+-*\\d*","\r"));
64+
65+
// cleanList.add(new Clean("\\.{5,}", " "));
66+
cleanList.add(new Clean("\\/{3}", ""));
67+
68+
// cleanList.add(new Clean("(?<=[a-z])\\.(?=[A-Z])", ". "));
69+
// cleanList.add(new Clean("(?<=\\d)\\.(?=[A-Z])", ". "));
70+
71+
cleanList.add(new Clean("\\n(?=•')", "\r"));
72+
cleanList.add(new Clean("''", "\""));
73+
cleanList.add(new Clean("``", "\""));
74+
75+
}
76+
77+
public void html() {
78+
cleanList.add(new Clean("<\\/?\\w+((\\s+\\w+(\\s*=\\s*(?:\\\".*?\\\"|'.*?'|" +
79+
"[\\^'\\\">\\s]+))?)+\\s*|\\s*)\\/?>", ""));
80+
cleanList.add(new Clean("&lt;\\/?[^gt;]*gt;", ""));
81+
}
82+
83+
public void pdf() {
84+
cleanList.add(new Clean("(?<=[^\\n]\\s)\\n(?=\\S)", ""));
85+
cleanList.add(new Clean("\\n(?=[a-z])", " "));
86+
}
87+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package opennlp.tools.sentdetect.segment;
19+
20+
import java.util.ArrayList;
21+
22+
/**
23+
* TODO: Move rules into profiles
24+
*/
25+
public class EnglishRule {
26+
private static LanguageRule languageRule = new LanguageRule("eng", new ArrayList<Rule>());
27+
28+
public EnglishRule() {
29+
common();
30+
number();
31+
name();
32+
betweenPunctuation();
33+
list();
34+
}
35+
36+
public LanguageRule getLanguageRule() {
37+
return languageRule;
38+
}
39+
40+
41+
private void common() {
42+
43+
languageRule.addRule(new Rule(true, "\\n", ""));
44+
languageRule.addRule(new Rule(true, " ", "\\n"));
45+
46+
languageRule.addRule(new Rule(true, "[\\.\\?!]+\\s+", "[^\\.]"));
47+
48+
languageRule.addRule(new Rule(true, "[\\.\\?!]+", "\\s*(A |Being|Did|For|He|" +
49+
"How|However|I|In|It|Millions|More|She|That|The|There|They|We|What|When|Where|Who|Why)"));
50+
51+
languageRule.addRule(new Rule(true, "[!?\\.-][\\\"\\'“”]\\s+", "[A-Z]"));
52+
53+
languageRule.addRule(new Rule(true, "(?<=\\S)(!|\\?){3,}", "(?=(\\s|\\Z|$))"));
54+
55+
languageRule.addRule(new Rule(false, "[\\.\\?!]+\\s*", "(?=[\\.\\?!])"));
56+
57+
languageRule.addRule(new Rule(false, "([a-zA-z]°)\\.\\s*", "(?=\\d+)"));
58+
59+
languageRule.addRule(new Rule(false, "\\s", "(?=[a-z])"));
60+
}
61+
62+
private void number() {
63+
languageRule.addRule(new Rule(false, "\\d\\.", "(?=\\d)"));
64+
65+
}
66+
67+
private void name() {
68+
69+
languageRule.addRule(new Rule(false, "(Mr|Mrs|Ms|Dr|p.m|a.m|tel)\\.", "\\s*"));
70+
71+
languageRule.addRule(new Rule(true, "(P\\.M\\.|A\\.M\\.)", "\\s+"));
72+
73+
languageRule.addRule(new Rule(false, "(?<=(?<=^)[A-Z]\\.\\s+|(?<=\\A)[A-Z]\\.\\s+|" +
74+
"[A-Z]\\.\\s+|(?<=^)[A-Z][a-z]\\.\\s+|(?<=\\A)[A-Z][a-z]\\.\\s+|(?<=\\s)[A-Z]" +
75+
"[a-z]\\.\\s)", "(?!(A |Being|Did|For|He|How|However|I|In|It|Millions|" +
76+
"More|She|That|The|There|They|We|What|When|Where|Who|Why))"));
77+
}
78+
79+
private void betweenPunctuation() {
80+
81+
languageRule.addRule(new Rule(false, "(?<=\\s)'(?:[^']|'[a-zA-Z])*'", ""));
82+
83+
languageRule.addRule(new Rule(false, "(?<=\\s)‘(?:[^’]|’[a-zA-Z])*’", ""));
84+
85+
languageRule.addRule(new Rule(false, "\"(?>[^\"\\\\]+|\\\\{2}|\\\\.)*\"", ""));
86+
87+
languageRule.addRule(new Rule(false, "«(?>[^»\\\\]+|\\\\{2}|\\\\.)*»", ""));
88+
89+
languageRule.addRule(new Rule(false, "“(?>[^”\\\\]+|\\\\{2}|\\\\.)*”", ""));
90+
91+
languageRule.addRule(new Rule(false, "\\[(?>[^\\]\\\\]+|\\\\{2}|\\\\.)*\\]", ""));
92+
93+
languageRule.addRule(new Rule(false, "\\((?>[^\\(\\)\\\\]+|\\\\{2}|\\\\.)*\\)", ""));
94+
95+
languageRule.addRule(new Rule(false, "(?<=\\s)\\-\\-(?>[^\\-\\-])*\\-\\-", ""));
96+
}
97+
98+
private void list() {
99+
100+
languageRule.addRule(new Rule(false, "((?<=^)[a-z]\\.|(?<=\\A)[a-z]\\.|(?<=\\s)[a-z]\\.)",
101+
"\\s*(?!(A |Being|Did|For|He|How|However|I|In|It|Millions|More|She|That|The|There|" +
102+
"They|We|What|When|Where|Who|Why))"));
103+
104+
//number_list
105+
languageRule.addRule(new Rule(false, "(?<=\\s)\\d{1,2}\\.(\\s)|^\\d{1,2}\\.(\\s)|" +
106+
"(?<=\\s)\\d{1,2}\\.(\\))|^\\d{1,2}\\.(\\))|(?<=\\s\\-)\\d{1,2}\\.(\\s)|" +
107+
"(?<=^\\-)\\d{1,2}\\.(\\s)|(?<=\\s\\⁃)\\d{1,2}\\.(\\s)|(?<=^\\⁃)\\d{1,2}\\.(\\s)|" +
108+
"(?<=\\s\\-)\\d{1,2}\\.(\\))|(?<=^\\-)\\d{1,2}\\.(\\))|(?<=\\s\\⁃)\\d{1,2}\\.(\\))|" +
109+
"(?<=^\\⁃)\\d{1,2}\\.(\\))|(\\•)\\s*\\d{1,2}\\.(\\s)|(?<=\\s)\\d{1,2}(\\))", "\\s*"));
110+
111+
//number_list
112+
languageRule.addRule(new Rule(true, "", "\\s+((?<=\\s)\\d{1,2}\\.(?=\\s)|" +
113+
"^\\d{1,2}\\.(?=\\s)|(?<=\\s)\\d{1,2}\\.(?=\\))|^\\d{1,2}\\.(?=\\))|((?<=\\s)\\-)" +
114+
"\\d{1,2}\\.(?=\\s)|(^\\-)\\d{1,2}\\.(?=\\s)|((?<=\\s)\\⁃)\\d{1,2}\\.(?=\\s)|" +
115+
"(^\\⁃)\\d{1,2}\\.(?=\\s)|((?<=\\s)\\-)\\d{1,2}\\.(?=\\))|(^\\-)\\d{1,2}\\.(?=\\))|" +
116+
"((?<=\\s)\\⁃)\\d{1,2}\\.(?=\\))|(^\\⁃)\\d{1,2}\\.(?=\\))|(\\•)\\s*\\d{1,2}\\.(\\s)|" +
117+
"(?<=\\s)\\d{1,2}(?=\\)))"));
118+
}
119+
120+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package opennlp.tools.sentdetect.segment;
19+
20+
import java.util.ArrayList;
21+
import java.util.Collections;
22+
import java.util.List;
23+
24+
/**
25+
* Represents rule for segmenting text in some language. Contains {@link Rule}
26+
* list.
27+
*
28+
*/
29+
public class LanguageRule {
30+
31+
private List<Rule> ruleList;
32+
33+
private String name;
34+
35+
/**
36+
* Creates language rule.
37+
*
38+
* @param name language rule name
39+
* @param ruleList rule list (it will be shallow copied)
40+
*/
41+
public LanguageRule(String name, List<Rule> ruleList) {
42+
this.ruleList = new ArrayList<Rule>(ruleList);
43+
this.name = name;
44+
}
45+
46+
/**
47+
* Creates empty language rule.
48+
*
49+
* @param name language rule name
50+
*/
51+
public LanguageRule(String name) {
52+
this(name, new ArrayList<Rule>());
53+
}
54+
55+
/**
56+
* @return unmodifiable rules list
57+
*/
58+
public List<Rule> getRuleList() {
59+
return Collections.unmodifiableList(ruleList);
60+
}
61+
62+
/**
63+
* Adds rule to the end of rule list.
64+
* @param rule
65+
*/
66+
public void addRule(Rule rule) {
67+
ruleList.add(rule);
68+
}
69+
70+
/**
71+
* @return language rule name
72+
*/
73+
public String getName() {
74+
return name;
75+
}
76+
77+
}

0 commit comments

Comments
 (0)