Skip to content

Commit a5b777e

Browse files
committed
tokenize &entities; U.S.S.R.
1 parent 6c46d56 commit a5b777e

19 files changed

Lines changed: 539725 additions & 539658 deletions

File tree

12.2 KB
Binary file not shown.

alix-cli/lib/alix-common-1.0.0.jar

3.39 KB
Binary file not shown.

alix-cli/lib/alix-fr-1.0.0.jar

467 Bytes
Binary file not shown.

alix-cli/lib/alix-util-1.0.0.jar

28.5 KB
Binary file not shown.
1.67 MB
Binary file not shown.
4.23 MB
Binary file not shown.

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/FrDics.java

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ synchronized static public void load(final String name, final Reader reader, boo
284284
loaded.add(name);
285285
CSVReader csv = null;
286286
try {
287-
csv = new CSVReader(reader, 4);
287+
csv = new CSVReader(reader, 4, ',');
288288
csv.readRow(); // skip first line
289289
Row row;
290290
while ((row = csv.readRow()) != null) {
@@ -301,20 +301,10 @@ synchronized static public void load(final String name, final Reader reader, boo
301301
decompose(graph, TREELOC);
302302
}
303303
// known abbreviation with at least one final dot, add the compounds
304-
// do not handle multi word abbreviation like "av. J.-C."
304+
// multi-word abbreviations like "av. J.-C." are not handled here
305305
Chain norm = row.get(NORM);
306306
if (!hasSpace && graph.last() == '.') {
307-
// if multiple dots like U.S.A., add U., U.S., and U.S.A.
308-
for (int length = 2; length <= graph.length() ; length++) {
309-
if (graph.charAt(length - 1) != '.') continue;
310-
CharsAttImpl key = new CharsAttImpl(graph, 0, length);
311-
BREVIDOT.add(key);
312-
}
313-
if (!norm.isEmpty()) {
314-
NORMALIZE.put(new CharsAttImpl(graph), new CharsAttImpl(norm));
315-
}
316-
// do not add brevidots to dico ?
317-
continue;
307+
BREVIDOT.add(new CharsAttImpl(graph));
318308
}
319309
// check if it is normalization
320310
if (!norm.isEmpty()) {
@@ -388,6 +378,21 @@ public static boolean norm(CharsAtt att)
388378
att.setEmpty().append(val);
389379
return true;
390380
}
381+
382+
/**
383+
* Get normalized orthographic form for a real graphical form in text.
384+
*
385+
* @param test {@link CharsAtt} implementation, normalized.
386+
* @return true if a normalization has been done, false otherwise.
387+
*/
388+
public static boolean norm(final CharsAtt test, final CharTermAttribute dst)
389+
{
390+
CharsAtt val = NORMALIZE.get(test);
391+
if (val == null)
392+
return false;
393+
dst.setEmpty().append(val);
394+
return true;
395+
}
391396

392397
/**
393398
* Get a dictionary entry from the word dictionary

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/TokenizerML.java

Lines changed: 58 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package com.github.oeuvres.alix.lucene.analysis;
22

33
import java.io.IOException;
4+
import java.util.Map;
5+
import java.util.Set;
46

57
import org.apache.lucene.analysis.CharacterUtils;
68
import org.apache.lucene.analysis.Tokenizer;
@@ -88,14 +90,20 @@ public class TokenizerML extends Tokenizer
8890
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
8991
/** Used as char[] wrapper for testing */
9092
private final CharsAttImpl test = new CharsAttImpl();
91-
/** Buffer of chars */
92-
private final CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(8192);
93+
/** Buffer of chars, given a big size to avoid problems (TODO: original note is unfinished, say which ones) */
94+
private final CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(2 * 1024 * 1024);
9395
/** Position in buffer */
9496
private int bufferIndex = 0;
9597
/** size of buffer*/
9698
private int bufferLen = 0;
9799
/** current char offset */
98100
private int offset = 0;
101+
/** XML entities */
102+
static final Map<String, String> XML_ENT = Map.ofEntries(
103+
Map.entry("gt", ">"),
104+
Map.entry("lt", "<"),
105+
Map.entry("amp", "&")
106+
);
99107

100108
/**
101109
* Build a Tokenizer for Markup tagged text.
@@ -111,7 +119,7 @@ public final boolean incrementToken() throws IOException
111119
// flags
112120
boolean intag = false;
113121
boolean number = false;
114-
122+
int amp = -1; // check XML entities
115123
// Mandatory start of a term
116124
int startOffset = -1;
117125
char lastChar;
@@ -136,12 +144,12 @@ public final boolean incrementToken() throws IOException
136144
}
137145
// if unlucky, an attempt to go back in the buffer can yield a negative index
138146
if (bufferIndex < 0) {
139-
// System.out.println(buffer.getBuffer());
140147
bufferIndex = 0;
141148
}
142149
lastChar = c;
143150
c = buffer.getBuffer()[bufferIndex];
144-
// default, go next
151+
// remember the term length at a '&', where a possible XML entity starts
152+
if (c == '&') amp = termAtt.length();
145153

146154
// start of a tag, do not advance cursors
147155
if (c == '<') {
@@ -216,9 +224,24 @@ else if (Char.isToken(c)) {
216224
break; // a too big token stop
217225
}
218226
}
219-
// possible entity
220-
else if (c == ';' && termAtt.charAt(0) == '&') {
221-
// TODO !
227+
// xml entity, handle case like -&gt;
228+
else if (c == ';' && amp >= 0) {
229+
termAtt.append(c);
230+
final int lim = termAtt.length() - 2 - amp;
231+
for (var entry : XML_ENT.entrySet()) {
232+
String key = entry.getKey();
233+
if (key.length() != lim) continue;
234+
int pos = 0;
235+
for (; pos < lim; pos++) {
236+
if (termAtt.charAt(pos + amp + 1) != key.charAt(pos)) break;
237+
}
238+
// entity seems found here
239+
if (pos == lim) {
240+
termAtt.setLength(amp).append(entry.getValue());
241+
break;
242+
}
243+
}
244+
amp = -1;
222245
}
223246
// Clause punctuation, send a punctuation event to separate tokens
224247
else if (',' == c || ';' == c || ':' == c || '(' == c || ')' == c || '—' == c || '–' == c
@@ -236,17 +259,12 @@ else if (Char.isToken(c)) {
236259
offset++;
237260
break;
238261
}
239-
// abbreviation ?
262+
// complex case, the dot and abbreviations, append and let next filter define what to do
240263
else if (c == '.' && Char.isLetter(lastChar) ) {
241264
termAtt.append(c);
242-
// not an abbreviaiton, send without dot
243-
if (!FrDics.isBrevidot( test.wrap(termAtt.buffer(), termAtt.length()) )) {
244-
termAtt.setLength(termAtt.length() - 1);
245-
break;
246-
}
247265
}
248266
// Possible sentence delimiters
249-
else if (c == '.' || c == '…' || c == '?' || c == '!' ) {
267+
else if ( c == '.' || c == '…' || c == '?' || c == '!' ) {
250268
// if pending word, send, and come back later
251269
if (!termAtt.isEmpty() && lastChar != '.' && lastChar != '?' && lastChar != '!') {
252270
break;
@@ -258,13 +276,37 @@ else if (c == '.' || c == '…' || c == '?' || c == '!' ) {
258276
}
259277
termAtt.append(c);
260278
}
261-
// not token char, token to send
279+
// not token char, token to send ?
262280
else if (!termAtt.isEmpty()) {
263281
break;
264282
}
265283
bufferIndex++;
266284
offset++;
267285
}
286+
// final dot special case
287+
int len = termAtt.length();
288+
if (Char.isLetter(termAtt.charAt(0)) && termAtt.charAt(len - 1) == '.') {
289+
test.wrap(termAtt.buffer(), termAtt.length());
290+
if (FrDics.isBrevidot(test) ) {
291+
// maybe normalize now
292+
FrDics.norm(test, termAtt);
293+
}
294+
// one letter, abbreviation
295+
else if (termAtt.length() == 2) {
296+
297+
}
298+
// go back in the buffer to restart from the first dot; remember the "..." case
299+
else {
300+
while (termAtt.charAt(len - 1) == '.') {
301+
len--;
302+
bufferIndex--;
303+
offset--;
304+
endOffset = offset;
305+
}
306+
termAtt.setLength(len);
307+
}
308+
}
309+
268310
// here, a term should be ready, send it
269311
posIncAtt.setPositionIncrement(1);
270312
posLenAtt.setPositionLength(1);

analysis/src/java/com/github/oeuvres/alix/lucene/index/Analyze4vec.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,10 @@ else if (flags == PUNsent.code) {
100100
else if (PUN.isPun(flags)) {
101101
continue;
102102
}
103+
// unknown
104+
else if (flags == TOKEN.code()) {
105+
continue;
106+
}
103107
else {
104108
char[] chars = termAtt.buffer();
105109
final int len = termAtt.length();

analysis/src/java/com/github/oeuvres/alix/lucene/index/ListErrors.java

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -64,14 +64,17 @@ public Integer call() throws Exception
6464
}
6565
}
6666
else {
67-
String text = "Mais cela ne signifie naturellement pas qu’il sache d’emblée composer les dépassements entre eux (Δ<hi>xz</hi> = Δ<hi>xy</hi> + Δ<hi>yz</hi>) et, comme on le verra sous 2), il est au contraire probable qu’aux débuts un plus grand dépassement, et même un dépassement égal mais entre éléments plus grands, leur paraissent d’une autre nature qu’un dépassement entre petits éléments.";
67+
String text = "Mais cela ne signifie naturellement pas qu’il sache d’emblée composer les dépassements &gt; entre eux (Δ<hi>xz</hi> = Δ<hi>xy</hi> + Δ<hi>yz</hi>) &amp;, comme on le verra sous 2), il est au contraire probable qu’aux débuts un plus grand dépassement, et même un dépassement égal mais entre éléments plus grands, leur paraissent d’une autre nature qu’un dépassement entre petits éléments.";
6868
analyze(analyzer.tokenStream("", new StringReader(text)));
6969
}
7070

7171

7272
analyzer.close();
7373
Top<Chain> top = new Top<Chain>(Chain.class, 2000);
7474
for (Entry<Chain, IntMutable> entry: errors.entrySet()) {
75+
if (entry.getKey().equals("Ad*")) {
76+
System.out.println(entry);
77+
}
7578
top.insert(entry.getValue().value(), entry.getKey());
7679
}
7780

@@ -113,8 +116,8 @@ private void analyze(final TokenStream tokenStream) throws IOException
113116
continue;
114117
}
115118
charsAtt.wrap(termAtt.buffer(), termAtt.length());
119+
FrDics.norm(charsAtt);
116120
if (Char.isUpperCase(charsAtt.charAt(0))) {
117-
FrDics.norm(charsAtt);
118121
if (FrDics.name(charsAtt) != null) {
119122
up();
120123
continue;
@@ -126,11 +129,28 @@ private void analyze(final TokenStream tokenStream) throws IOException
126129
continue;
127130
}
128131
charsAtt.toLower();
132+
if (FrDics.word(charsAtt) != null) {
133+
up();
134+
continue;
135+
}
136+
// candidate name, leave it as is
137+
// charsAtt.capitalize();
138+
up();
139+
continue;
140+
}
141+
else if (FrDics.word(charsAtt) != null) {
142+
up();
143+
continue;
129144
}
130-
if (FrDics.word(charsAtt) != null) {
145+
// variables
146+
if (
147+
charsAtt.length() == 1
148+
|| (charsAtt.length() == 2 && (charsAtt.charAt(1) == '\'' || charsAtt.charAt(1) == '.' || Char.isDigit(charsAtt.charAt(1))))
149+
) {
131150
up();
132151
continue;
133152
}
153+
134154
if (!form.isEmpty()) form.append(' ');
135155
form.append(termAtt);
136156
}
@@ -173,13 +193,8 @@ public TokenStreamComponents createComponents(String field)
173193
*/
174194
public static void main(String[] args) throws Exception
175195
{
176-
/*
177196
int exitCode = new CommandLine(new ListErrors()).execute(args);
178197
System.exit(exitCode);
179-
*/
180-
CharTermAttribute term = new CharTermAttributeImpl();
181-
term.append("test");
182-
System.out.println(term.equals("test"));
183198
}
184199

185200
}

0 commit comments

Comments
 (0)