|
17 | 17 |
|
18 | 18 | package opennlp.tools.sentdetect.segment;
|
19 | 19 |
|
20 |
| -import java.io.IOException; |
21 |
| -import java.io.Reader; |
22 |
| -import java.util.ArrayList; |
23 | 20 | import java.util.List;
|
24 |
| -import java.util.Set; |
25 |
| -import java.util.TreeSet; |
26 |
| -import java.util.regex.Matcher; |
27 |
| -import java.util.regex.Pattern; |
28 | 21 |
|
29 |
| -import opennlp.tools.util.StringUtil; |
30 |
| - |
31 |
| - |
32 |
| -public class SentenceTokenizer { |
33 |
| - |
34 |
| - private String sentence; |
35 |
| - |
36 |
| - private int start; |
37 |
| - |
38 |
| - private int end; |
39 |
| - |
40 |
| - private CharSequence text; |
41 |
| - |
42 |
| - private Reader reader; |
43 |
| - |
44 |
| - private int bufferLength; |
45 |
| - |
46 |
| - private LanguageTool languageTool; |
47 |
| - |
48 |
| - private Matcher beforeMatcher; |
49 |
| - |
50 |
| - private Matcher afterMatcher; |
51 |
| - |
52 |
| - boolean found; |
53 |
| - |
54 |
| - private Set<Integer> breakSections; |
55 |
| - |
56 |
| - private List<Section> noBreakSections; |
57 |
| - |
58 |
| - public SentenceTokenizer(LanguageTool languageTool, CharSequence text) { |
59 |
| - this.text = text; |
60 |
| - this.reader = null; |
61 |
| - this.bufferLength = text.length(); |
62 |
| - this.languageTool = languageTool; |
63 |
| - this.sentence = null; |
64 |
| - this.start = 0; |
65 |
| - this.end = 0; |
66 |
| - } |
67 |
| - |
68 |
| - public SentenceTokenizer(LanguageTool languageTool, Reader reader, int bufferLength) { |
69 |
| - if (bufferLength <= 0) { |
70 |
| - throw new IllegalArgumentException("Buffer size: " + bufferLength + |
71 |
| - " must be positive."); |
72 |
| - } |
73 |
| - this.text = null; |
74 |
| - this.reader = reader; |
75 |
| - this.bufferLength = bufferLength; |
76 |
| - this.languageTool = languageTool; |
77 |
| - this.sentence = null; |
78 |
| - this.start = 0; |
79 |
| - this.end = 0; |
80 |
| - } |
81 |
| - |
82 |
| - public List<String> sentenceTokenizer() { |
83 |
| - |
84 |
| - List<String> sentenceList = new ArrayList<>(); |
85 |
| - CharSequence text = getText(); |
86 |
| - if (breakSections == null) { |
87 |
| - getBreakSections(); |
88 |
| - } |
89 |
| - for (Integer breakSection : breakSections) { |
90 |
| - if (breakSection == 0) { |
91 |
| - continue; |
92 |
| - } |
93 |
| - if (breakSection >= text.length()) { |
94 |
| - break; |
95 |
| - } |
96 |
| - end = breakSection; |
97 |
| - if (!isBreak()) { |
98 |
| - continue; |
99 |
| - } |
100 |
| - sentence = text.subSequence(start, end).toString(); |
101 |
| - start = end; |
102 |
| - |
103 |
| - sentence = removeSpace(sentence); |
104 |
| - if (sentence != null) { |
105 |
| - sentenceList.add(sentence); |
106 |
| - } |
107 |
| - } |
108 |
| - if (end < text.length()) { |
109 |
| - end = text.length(); |
110 |
| - sentence = text.subSequence(start, end).toString(); |
111 |
| - sentence = removeSpace(sentence); |
112 |
| - if (sentence != null) { |
113 |
| - sentenceList.add(sentence); |
114 |
| - } |
115 |
| - } |
116 |
| - return sentenceList; |
117 |
| - } |
118 |
| - |
119 |
| - public String removeSpace(String segment) { |
120 |
| - if (segment != null) { |
121 |
| - int first = 0; |
122 |
| - int last = segment.length(); |
123 |
| - while (first < segment.length() && StringUtil.isWhitespace(segment.charAt(first))) { |
124 |
| - first++; |
125 |
| - } |
126 |
| - while (last > 0 && StringUtil.isWhitespace(segment.charAt(last - 1))) { |
127 |
| - last--; |
128 |
| - } |
129 |
| - if (last - first > 0) { |
130 |
| - return segment.substring(first, last); |
131 |
| - } |
132 |
| - } |
133 |
| - return null; |
134 |
| - } |
135 |
| - |
136 |
| - public Set<Integer> getBreakSections() { |
137 |
| - if (breakSections == null) { |
138 |
| - breakSections = new TreeSet<Integer>(); |
139 |
| - for (Rule rule : languageTool.getBreakRuleList()) { |
140 |
| - |
141 |
| - Pattern beforePattern = languageTool.compile(rule.getBeforePattern()); |
142 |
| - Pattern afterPattern = languageTool.compile(rule.getAfterPattern()); |
143 |
| - this.beforeMatcher = beforePattern.matcher(text); |
144 |
| - this.afterMatcher = afterPattern.matcher(text); |
145 |
| - this.found = true; |
146 |
| - while (find()) { |
147 |
| - breakSections.add(getBreakPosition()); |
148 |
| - } |
149 |
| - } |
150 |
| - } |
151 |
| - return breakSections; |
152 |
| - } |
153 |
| - |
154 |
| - private boolean find() { |
155 |
| - found = false; |
156 |
| - while ((!found) && beforeMatcher.find()) { |
157 |
| - afterMatcher.region(beforeMatcher.end(), text.length()); |
158 |
| - found = afterMatcher.lookingAt(); |
159 |
| - } |
160 |
| - return found; |
161 |
| - } |
162 |
| - |
163 |
| - private int getBreakPosition() { |
164 |
| - return afterMatcher.start(); |
165 |
| - } |
166 |
| - |
167 |
| - public List<Section> getNoBreakSections() { |
168 |
| - if (noBreakSections == null) { |
169 |
| - noBreakSections = new ArrayList<Section>(); |
170 |
| - Pattern pattern = languageTool.getNoBreakPattern(); |
171 |
| - Matcher matcher = pattern.matcher(getText()); |
172 |
| - while (matcher.find()) { |
173 |
| - noBreakSections.add(new Section(matcher.start(), matcher.end())); |
174 |
| - } |
175 |
| - } |
176 |
| - return noBreakSections; |
177 |
| - } |
178 |
| - |
179 |
| - public CharSequence getText() { |
180 |
| - if (text == null) { |
181 |
| - text = read(bufferLength + 1); |
182 |
| - } |
183 |
| - return text; |
184 |
| - } |
185 |
| - |
186 |
| - private String read(int amount) { |
187 |
| - char[] charBuffer = new char[amount]; |
188 |
| - int count = read(reader, charBuffer); |
189 |
| - |
190 |
| - String result; |
191 |
| - if (count == amount) { |
192 |
| - result = new String(charBuffer, 0, count - 1); |
193 |
| - } else if (count > 0 && count < amount) { |
194 |
| - result = new String(charBuffer, 0, count); |
195 |
| - } else { |
196 |
| - result = ""; |
197 |
| - } |
198 |
| - |
199 |
| - return result; |
200 |
| - } |
201 |
| - |
202 |
| - private int read(Reader reader, char[] buffer) { |
203 |
| - |
204 |
| - int start = 0; |
205 |
| - int count; |
206 |
| - |
207 |
| - try { |
208 |
| - while (true) { |
209 |
| - if (!(((count = reader.read(buffer, start, buffer.length - start)) != -1) |
210 |
| - && start < buffer.length)) { |
211 |
| - break; |
212 |
| - } |
213 |
| - start += count; |
214 |
| - } |
215 |
| - } catch (IOException e) { |
216 |
| - e.printStackTrace(); |
217 |
| - } |
218 |
| - return start; |
219 |
| - } |
220 |
| - |
221 |
| - private boolean isBreak() { |
222 |
| - if (noBreakSections == null) { |
223 |
| - getNoBreakSections(); |
224 |
| - } |
225 |
| - if (noBreakSections != null && noBreakSections.size() > 0) { |
226 |
| - for (Section section : noBreakSections) { |
227 |
| - if (end >= section.getLeft() && end <= section.getRight()) { |
228 |
| - return false; |
229 |
| - } |
230 |
| - } |
231 |
| - } |
232 |
| - return true; |
233 |
| - } |
| 22 | +/** |
| 23 | + * The interface for rule based sentence detector |
| 24 | + */ |
| 25 | +public interface SentenceTokenizer { |
234 | 26 |
|
| 27 | + public List<String> sentenceTokenizer(); |
235 | 28 | }
|
0 commit comments