Skip to content

Commit 1a090cf

Browse files
committed
Tokenizer OK (with hyphens and apos)
1 parent 8f23951 commit 1a090cf

6 files changed

Lines changed: 174 additions & 128 deletions

File tree

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/AttLinkedList.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,10 @@ public void addLast(
173173
ensureLast();
174174
last.copy(buffer, copyOffset, copyLength, startOffset, endOffset);
175175
}
176+
177+
public void clear() {
178+
first = last = null;
179+
}
176180

177181
/**
178182
* Like {@link AbstractCollection#isEmpty()}.
Lines changed: 118 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,35 @@
1+
/*
2+
* Alix, A Lucene Indexer for XML documents.
3+
*
4+
* Copyright 2016 Frédéric Glorieux <frederic.glorieux@fictif.org>
5+
* Copyright 2009 Pierre Dittgen <pierre@dittgen.org>
6+
* Frédéric Glorieux <frederic.glorieux@fictif.org>
7+
*
8+
* Alix is a java library to index and search XML text documents
9+
* with Lucene https://lucene.apache.org/core/
10+
* including linguistic tools for French,
11+
* available under Apache license.
12+
*
13+
* Alix has been started in 2009 under the javacrim project
14+
* https://sf.net/projects/javacrim/
15+
* for a java course at Inalco http://www.er-tim.fr/
16+
* Alix continues the concepts of SDX under another licence
17+
* «Système de Documentation XML»
18+
* 2000-2010 Ministère de la culture et de la communication (France), AJLSM.
19+
* http://savannah.nongnu.org/projects/sdx/
20+
*
21+
* Licensed under the Apache License, Version 2.0 (the "License");
22+
* you may not use this file except in compliance with the License.
23+
* You may obtain a copy of the License at
24+
*
25+
* http://www.apache.org/licenses/LICENSE-2.0
26+
*
27+
* Unless required by applicable law or agreed to in writing, software
28+
* distributed under the License is distributed on an "AS IS" BASIS,
29+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
30+
* See the License for the specific language governing permissions and
31+
* limitations under the License.
32+
*/
133
package com.github.oeuvres.alix.lucene.analysis;
234

335
import java.io.IOException;
@@ -10,16 +42,21 @@
1042
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
1143

1244
import static com.github.oeuvres.alix.common.Upos.*;
45+
1346
/**
1447
* A filter that decomposes words on a list of suffixes and prefixes, mainly to handle
1548
* hyphenation and apostrophe elision in French. The original token is broken and lost,
1649
* offsets are precisely kept, so that word counting and stats are not biased by multiple
1750
* words on same positions.
1851
*
52+
* https://fr.wikipedia.org/wiki/Emploi_du_trait_d%27union_pour_les_pr%C3%A9fixes_en_fran%C3%A7ais
53+
*
1954
* Known side effect : qu’en-dira-t-on, donne-m’en, emmène-m’y.
2055
*/
2156
public class FilterAposHyphenFr extends TokenFilter
2257
{
58+
private static final int MAX_STEPS = 16;
59+
2360
/** The term provided by the Tokenizer */
2461
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
2562
/** Char index in source text. */
@@ -28,23 +65,21 @@ public class FilterAposHyphenFr extends TokenFilter
2865
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
2966
/** Stack of stored states */
3067
private final AttLinkedList deque = new AttLinkedList();
31-
32-
68+
3369
/** Elision prefixes (ending with an apostrophe), mapped to the term emitted in their place. */
34-
static CharArrayMap<char[]> PREFIX = new CharArrayMap<>(30, false);
35-
static { // ellisions
70+
private static final CharArrayMap<char[]> PREFIX = new CharArrayMap<>(30, false);
71+
static {
3672
PREFIX.put("d'", "de".toCharArray());
37-
PREFIX.put("d'", "de".toCharArray()); // keep ' for locution, like d’abord
3873
PREFIX.put("D'", "de".toCharArray());
39-
PREFIX.put("j'", "je".toCharArray()); // j’aime.
74+
PREFIX.put("j'", "je".toCharArray());
4075
PREFIX.put("J'", "je".toCharArray());
4176
PREFIX.put("jusqu'", "jusque".toCharArray());
4277
PREFIX.put("Jusqu'", "jusque".toCharArray());
4378
PREFIX.put("l'", "l'".toCharArray()); // je l’aime. le ou la
4479
PREFIX.put("L'", "l'".toCharArray());
4580
PREFIX.put("lorsqu'", "lorsque".toCharArray());
4681
PREFIX.put("Lorsqu'", "lorsque".toCharArray());
47-
PREFIX.put("m'", "me".toCharArray()); // il m’aime.
82+
PREFIX.put("m'", "me".toCharArray());
4883
PREFIX.put("M'", "me".toCharArray());
4984
PREFIX.put("n'", "ne".toCharArray()); // N’y va pas.
5085
PREFIX.put("N'", "ne".toCharArray());
@@ -56,15 +91,16 @@ public class FilterAposHyphenFr extends TokenFilter
5691
PREFIX.put("Quelqu'", "quelque".toCharArray());
5792
PREFIX.put("quoiqu'", "quoique".toCharArray());
5893
PREFIX.put("Quoiqu'", "quoique".toCharArray());
59-
PREFIX.put("s'", "se".toCharArray()); // il s’aime.
94+
PREFIX.put("s'", "se".toCharArray());
6095
PREFIX.put("S'", "se".toCharArray());
61-
PREFIX.put("t'", "te".toCharArray()); // il t’aime.
96+
PREFIX.put("t'", "te".toCharArray());
6297
PREFIX.put("T'", "te".toCharArray());
6398
}
64-
// https://fr.wikipedia.org/wiki/Emploi_du_trait_d%27union_pour_les_pr%C3%A9fixes_en_fran%C3%A7ais
99+
65100
/** Hyphen suffixes */
66-
static final CharArrayMap<char[]> SUFFIX = new CharArrayMap<>(30, false);
101+
private static final CharArrayMap<char[]> SUFFIX = new CharArrayMap<>(30, false);
67102
static {
103+
68104
SUFFIX.put("-ce", "ce".toCharArray()); // Serait-ce ?
69105
SUFFIX.put("-ci", null); // cette année-ci, ceux-ci.
70106
SUFFIX.put("-elle", "elle".toCharArray()); // dit-elle.
@@ -92,124 +128,123 @@ public class FilterAposHyphenFr extends TokenFilter
92128
SUFFIX.put("-y", "y".toCharArray()); // allons-y.
93129
}
94130

95-
96-
97-
/**
98-
* Default constructor.
99-
* @param input previous filter.
100-
*/
101131
public FilterAposHyphenFr(TokenStream input) {
102132
super(input);
103133
}
104134

105135
@Override
106136
public final boolean incrementToken() throws IOException
107137
{
108-
// check if a term has been stored from last call
138+
// Emit buffered tokens first
109139
if (!deque.isEmpty()) {
110140
deque.removeFirst(termAtt, offsetAtt);
111141
}
112142
else {
113-
if (!input.incrementToken()) {
114-
// end of stream
115-
return false;
116-
}
143+
if (!input.incrementToken()) return false;
117144
}
145+
118146
// do not try to split tokens flagged as XML tags
119147
if (flagsAtt.getFlags() == XML.code) {
120148
return true;
121149
}
122-
int loop = 0;
123-
while (true) {
124-
if (++loop > 10) {
125-
throw new IOException("AposHyph décon: " + deque);
126-
}
127-
char[] chars = termAtt.buffer();
128-
int hyphLast = termAtt.length() - 1;
129-
for (; hyphLast >= 0; hyphLast--) {
130-
if ('-' == chars[hyphLast]) break;
131-
}
132-
int aposFirst = 0;
133-
for (; aposFirst < termAtt.length(); aposFirst++) {
134-
if (chars[aposFirst] == '’') chars[aposFirst] = '\'';
135-
if ('\'' == chars[aposFirst]) break;
136-
}
137-
if (aposFirst >= termAtt.length()) aposFirst = -1;
138150

139-
if (aposFirst < 0 && hyphLast < 0) {
140-
// no changes
141-
return true;
142-
}
143-
// apos is last char, let it run, maybe maths A', D'
144-
if ((aposFirst + 1) == termAtt.length()) {
145-
return true;
146-
}
147-
// hyphen is first char, let it run, maybe linguistic -suffix
148-
if (hyphLast == 0) {
149-
return true;
150-
}
151-
// test prefixes
151+
for (int step = 0; step < MAX_STEPS; step++) {
152+
final int len = termAtt.length();
153+
if (len <= 1) return true;
154+
155+
final char[] buf = termAtt.buffer();
156+
157+
final int hyphLast = lastHyphenIndexAndNormalize(buf, len);
158+
final int aposFirst = firstAposIndexAndNormalize(buf, len);
159+
160+
if (aposFirst < 0 && hyphLast < 0) return true;
161+
162+
// apos is last char, let it run (maths A', D', etc.)
163+
if (aposFirst == len - 1) return true;
164+
165+
// hyphen is first or last char, let it run
166+
if (hyphLast == 0 || hyphLast == len - 1) return true;
167+
168+
// Prefix split on apostrophe
152169
if (aposFirst > 0) {
153170
final int startOffset = offsetAtt.startOffset();
154-
if (PREFIX.containsKey(termAtt.buffer(), 0, aposFirst + 1)) {
155-
final char[] value = PREFIX.get(termAtt.buffer(), 0, aposFirst + 1);
156-
/* Strip prefix ?
157-
if (value == null) {
158-
// skip this prefix, retry to find something
159-
termAtt.copyBuffer(termAtt.buffer(), aposFirst + 1, termAtt.length() - aposFirst - 1);
160-
offsetAtt.setOffset(startOffset + aposFirst + 1, offsetAtt.endOffset());
161-
continue;
162-
}
163-
*/
171+
final int prefixLen = aposFirst + 1;
172+
173+
final char[] value = PREFIX.get(buf, 0, prefixLen);
174+
if (value != null) {
164175
// keep term after prefix for next call
165176
deque.addLast(
166-
termAtt.buffer(),
167-
aposFirst + 1,
168-
termAtt.length() - aposFirst - 1,
169-
startOffset + aposFirst + 1,
177+
buf,
178+
prefixLen,
179+
len - prefixLen,
180+
startOffset + prefixLen,
170181
offsetAtt.endOffset()
171182
);
172183
// send the prefix
173184
termAtt.copyBuffer(value, 0, value.length);
174-
termAtt.setLength(aposFirst + 1);
175-
offsetAtt.setOffset(startOffset, startOffset + aposFirst + 1);
185+
offsetAtt.setOffset(startOffset, startOffset + prefixLen);
176186
return true;
177187
}
178188
}
189+
190+
// Suffix split on hyphen
179191
if (hyphLast > 0) {
180-
// test suffix
181-
if (SUFFIX.containsKey(termAtt.buffer(), hyphLast, termAtt.length() - hyphLast)) {
182-
final char[] value = SUFFIX.get(termAtt.buffer(), hyphLast, termAtt.length() - hyphLast);
183-
// if value is not skipped, add it at start in stack
192+
final int suffixLen = len - hyphLast;
193+
194+
if (SUFFIX.containsKey(buf, hyphLast, suffixLen)) {
195+
final char[] value = SUFFIX.get(buf, hyphLast, suffixLen);
196+
184197
if (value != null) {
185198
deque.addFirst(
186-
value,
187-
0,
199+
value,
200+
0,
188201
value.length,
189-
offsetAtt.startOffset()+hyphLast,
202+
offsetAtt.startOffset() + hyphLast,
190203
offsetAtt.endOffset()
191204
);
192205
}
193-
// set term without suffix, let work the loop
206+
207+
// set term without suffix, loop again (may strip multiple suffixes)
194208
offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset() + hyphLast);
195209
termAtt.setLength(hyphLast);
196210
continue;
197211
}
198212
}
213+
199214
return true; // term is OK like that
200215
}
216+
217+
throw new IllegalStateException("FilterAposHyphenFr: exceeded MAX_STEPS, deque=" + deque);
201218
}
202-
203-
@Override
204-
public void reset() throws IOException
205-
{
206-
super.reset();
219+
220+
private static int firstAposIndexAndNormalize(final char[] buf, final int len) {
221+
for (int i = 0; i < len; i++) {
222+
char c = buf[i];
223+
if (c == '’' || c == '\u02BC') { // U+2019 or U+02BC
224+
buf[i] = '\'';
225+
c = '\'';
226+
}
227+
if (c == '\'') return i;
228+
}
229+
return -1;
230+
}
231+
232+
private static int lastHyphenIndexAndNormalize(final char[] buf, final int len) {
233+
for (int i = len - 1; i >= 0; i--) {
234+
char c = buf[i];
235+
if (c == '\u2010' || c == '\u2011' || c == '\u00AD') { // hyphen variants
236+
buf[i] = '-';
237+
c = '-';
238+
}
239+
if (c == '-') return i;
240+
}
241+
return -1;
207242
}
208243

209244
@Override
210-
public void end() throws IOException
245+
public void reset() throws IOException
211246
{
212-
super.end();
247+
super.reset();
248+
deque.clear(); // drop any tokens still buffered from a previous stream
213249
}
214-
215250
}

test/src/java/com/github/oeuvres/alix/lucene/analysis/FilterAposHyphenFrTest.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import org.apache.lucene.analysis.Analyzer;
88
import org.apache.lucene.analysis.TokenStream;
99
import org.apache.lucene.analysis.Tokenizer;
10-
import org.apache.lucene.analysis.standard.StandardTokenizer;
1110
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
1211
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
1312
import org.junit.jupiter.api.Test;

0 commit comments

Comments
 (0)