Skip to content

Commit ae25942

Browse files
committed
OPENNLP-912: Move rules into rules.xml
1 parent 01420d7 commit ae25942

File tree

6 files changed

+466
-334
lines changed

6 files changed

+466
-334
lines changed

Diff for: opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/EnglishRule.java

-120
This file was deleted.

Diff for: opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/SentenceTokenizer.java

+5-212
Original file line numberDiff line numberDiff line change
@@ -17,219 +17,12 @@
1717

1818
package opennlp.tools.sentdetect.segment;
1919

20-
import java.io.IOException;
21-
import java.io.Reader;
22-
import java.util.ArrayList;
2320
import java.util.List;
24-
import java.util.Set;
25-
import java.util.TreeSet;
26-
import java.util.regex.Matcher;
27-
import java.util.regex.Pattern;
2821

29-
import opennlp.tools.util.StringUtil;
30-
31-
32-
public class SentenceTokenizer {
33-
34-
private String sentence;
35-
36-
private int start;
37-
38-
private int end;
39-
40-
private CharSequence text;
41-
42-
private Reader reader;
43-
44-
private int bufferLength;
45-
46-
private LanguageTool languageTool;
47-
48-
private Matcher beforeMatcher;
49-
50-
private Matcher afterMatcher;
51-
52-
boolean found;
53-
54-
private Set<Integer> breakSections;
55-
56-
private List<Section> noBreakSections;
57-
58-
public SentenceTokenizer(LanguageTool languageTool, CharSequence text) {
59-
this.text = text;
60-
this.reader = null;
61-
this.bufferLength = text.length();
62-
this.languageTool = languageTool;
63-
this.sentence = null;
64-
this.start = 0;
65-
this.end = 0;
66-
}
67-
68-
public SentenceTokenizer(LanguageTool languageTool, Reader reader, int bufferLength) {
69-
if (bufferLength <= 0) {
70-
throw new IllegalArgumentException("Buffer size: " + bufferLength +
71-
" must be positive.");
72-
}
73-
this.text = null;
74-
this.reader = reader;
75-
this.bufferLength = bufferLength;
76-
this.languageTool = languageTool;
77-
this.sentence = null;
78-
this.start = 0;
79-
this.end = 0;
80-
}
81-
82-
public List<String> sentenceTokenizer() {
83-
84-
List<String> sentenceList = new ArrayList<>();
85-
CharSequence text = getText();
86-
if (breakSections == null) {
87-
getBreakSections();
88-
}
89-
for (Integer breakSection : breakSections) {
90-
if (breakSection == 0) {
91-
continue;
92-
}
93-
if (breakSection >= text.length()) {
94-
break;
95-
}
96-
end = breakSection;
97-
if (!isBreak()) {
98-
continue;
99-
}
100-
sentence = text.subSequence(start, end).toString();
101-
start = end;
102-
103-
sentence = removeSpace(sentence);
104-
if (sentence != null) {
105-
sentenceList.add(sentence);
106-
}
107-
}
108-
if (end < text.length()) {
109-
end = text.length();
110-
sentence = text.subSequence(start, end).toString();
111-
sentence = removeSpace(sentence);
112-
if (sentence != null) {
113-
sentenceList.add(sentence);
114-
}
115-
}
116-
return sentenceList;
117-
}
118-
119-
public String removeSpace(String segment) {
120-
if (segment != null) {
121-
int first = 0;
122-
int last = segment.length();
123-
while (first < segment.length() && StringUtil.isWhitespace(segment.charAt(first))) {
124-
first++;
125-
}
126-
while (last > 0 && StringUtil.isWhitespace(segment.charAt(last - 1))) {
127-
last--;
128-
}
129-
if (last - first > 0) {
130-
return segment.substring(first, last);
131-
}
132-
}
133-
return null;
134-
}
135-
136-
public Set<Integer> getBreakSections() {
137-
if (breakSections == null) {
138-
breakSections = new TreeSet<Integer>();
139-
for (Rule rule : languageTool.getBreakRuleList()) {
140-
141-
Pattern beforePattern = languageTool.compile(rule.getBeforePattern());
142-
Pattern afterPattern = languageTool.compile(rule.getAfterPattern());
143-
this.beforeMatcher = beforePattern.matcher(text);
144-
this.afterMatcher = afterPattern.matcher(text);
145-
this.found = true;
146-
while (find()) {
147-
breakSections.add(getBreakPosition());
148-
}
149-
}
150-
}
151-
return breakSections;
152-
}
153-
154-
private boolean find() {
155-
found = false;
156-
while ((!found) && beforeMatcher.find()) {
157-
afterMatcher.region(beforeMatcher.end(), text.length());
158-
found = afterMatcher.lookingAt();
159-
}
160-
return found;
161-
}
162-
163-
private int getBreakPosition() {
164-
return afterMatcher.start();
165-
}
166-
167-
public List<Section> getNoBreakSections() {
168-
if (noBreakSections == null) {
169-
noBreakSections = new ArrayList<Section>();
170-
Pattern pattern = languageTool.getNoBreakPattern();
171-
Matcher matcher = pattern.matcher(getText());
172-
while (matcher.find()) {
173-
noBreakSections.add(new Section(matcher.start(), matcher.end()));
174-
}
175-
}
176-
return noBreakSections;
177-
}
178-
179-
public CharSequence getText() {
180-
if (text == null) {
181-
text = read(bufferLength + 1);
182-
}
183-
return text;
184-
}
185-
186-
private String read(int amount) {
187-
char[] charBuffer = new char[amount];
188-
int count = read(reader, charBuffer);
189-
190-
String result;
191-
if (count == amount) {
192-
result = new String(charBuffer, 0, count - 1);
193-
} else if (count > 0 && count < amount) {
194-
result = new String(charBuffer, 0, count);
195-
} else {
196-
result = "";
197-
}
198-
199-
return result;
200-
}
201-
202-
private int read(Reader reader, char[] buffer) {
203-
204-
int start = 0;
205-
int count;
206-
207-
try {
208-
while (true) {
209-
if (!(((count = reader.read(buffer, start, buffer.length - start)) != -1)
210-
&& start < buffer.length)) {
211-
break;
212-
}
213-
start += count;
214-
}
215-
} catch (IOException e) {
216-
e.printStackTrace();
217-
}
218-
return start;
219-
}
220-
221-
private boolean isBreak() {
222-
if (noBreakSections == null) {
223-
getNoBreakSections();
224-
}
225-
if (noBreakSections != null && noBreakSections.size() > 0) {
226-
for (Section section : noBreakSections) {
227-
if (end >= section.getLeft() && end <= section.getRight()) {
228-
return false;
229-
}
230-
}
231-
}
232-
return true;
233-
}
22+
/**
23+
* The interface for rule-based sentence detectors.
24+
*/
25+
public interface SentenceTokenizer {
23426

27+
public List<String> sentenceTokenizer();
23528
}

0 commit comments

Comments
 (0)