1+ /*
2+ * Alix, A Lucene Indexer for XML documents.
3+ *
4+ * Copyright 2016 Frédéric Glorieux <frederic.glorieux@fictif.org>
5+ * Copyright 2009 Pierre Dittgen <pierre@dittgen.org>
6+ * Frédéric Glorieux <frederic.glorieux@fictif.org>
7+ *
8+ * Alix is a java library to index and search XML text documents
9+ * with Lucene https://lucene.apache.org/core/
10+ * including linguistic tools for French,
11+ * available under Apache license.
12+ *
13+ * Alix has been started in 2009 under the javacrim project
14+ * https://sf.net/projects/javacrim/
15+ * for a java course at Inalco http://www.er-tim.fr/
16+ * Alix continues the concepts of SDX under another licence
17+ * «Système de Documentation XML»
18+ * 2000-2010 Ministère de la culture et de la communication (France), AJLSM.
19+ * http://savannah.nongnu.org/projects/sdx/
20+ *
21+ * Licensed under the Apache License, Version 2.0 (the "License");
22+ * you may not use this file except in compliance with the License.
23+ * You may obtain a copy of the License at
24+ *
25+ * http://www.apache.org/licenses/LICENSE-2.0
26+ *
27+ * Unless required by applicable law or agreed to in writing, software
28+ * distributed under the License is distributed on an "AS IS" BASIS,
29+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
30+ * See the License for the specific language governing permissions and
31+ * limitations under the License.
32+ */
133package com .github .oeuvres .alix .lucene .analysis ;
234
335import java .io .IOException ;
1042import org .apache .lucene .analysis .tokenattributes .OffsetAttribute ;
1143
1244import static com .github .oeuvres .alix .common .Upos .*;
45+
1346/**
1447 * A filter that decomposes words on a list of suffixes and prefixes, mainly to handle
1548 * hyphenation and apostrophe elision in French. The original token is broken and lost,
1649 * offsets are precisely kept, so that word counting and stats are not biased by multiple
1750 * words on the same position.
1851 *
52+ * https://fr.wikipedia.org/wiki/Emploi_du_trait_d%27union_pour_les_pr%C3%A9fixes_en_fran%C3%A7ais
53+ *
1954 * Known side effect : qu’en-dira-t-on, donne-m’en, emmène-m’y.
2055 */
2156public class FilterAposHyphenFr extends TokenFilter
2257{
58+ private static final int MAX_STEPS = 16 ;
59+
2360 /** The term provided by the Tokenizer */
2461 private final CharTermAttribute termAtt = addAttribute (CharTermAttribute .class );
2562 /** Char index in source text. */
@@ -28,23 +65,21 @@ public class FilterAposHyphenFr extends TokenFilter
2865 private final FlagsAttribute flagsAtt = addAttribute (FlagsAttribute .class );
2966 /** Stack of stored states */
3067 private final AttLinkedList deque = new AttLinkedList ();
31-
32-
68+
3369 /** Ellisions prefix */
34- static CharArrayMap <char []> PREFIX = new CharArrayMap <>(30 , false );
35- static { // ellisions
70+ private static final CharArrayMap <char []> PREFIX = new CharArrayMap <>(30 , false );
71+ static {
3672 PREFIX .put ("d'" , "de" .toCharArray ());
37- PREFIX .put ("d'" , "de" .toCharArray ()); // keep ' for locution, like d’abord
3873 PREFIX .put ("D'" , "de" .toCharArray ());
39- PREFIX .put ("j'" , "je" .toCharArray ()); // j’aime.
74+ PREFIX .put ("j'" , "je" .toCharArray ());
4075 PREFIX .put ("J'" , "je" .toCharArray ());
4176 PREFIX .put ("jusqu'" , "jusque" .toCharArray ());
4277 PREFIX .put ("Jusqu'" , "jusque" .toCharArray ());
4378 PREFIX .put ("l'" , "l'" .toCharArray ()); // je l’aime. le ou la
4479 PREFIX .put ("L'" , "l'" .toCharArray ());
4580 PREFIX .put ("lorsqu'" , "lorsque" .toCharArray ());
4681 PREFIX .put ("Lorsqu'" , "lorsque" .toCharArray ());
47- PREFIX .put ("m'" , "me" .toCharArray ()); // il m’aime.
82+ PREFIX .put ("m'" , "me" .toCharArray ());
4883 PREFIX .put ("M'" , "me" .toCharArray ());
4984 PREFIX .put ("n'" , "ne" .toCharArray ()); // N’y va pas.
5085 PREFIX .put ("N'" , "ne" .toCharArray ());
@@ -56,15 +91,16 @@ public class FilterAposHyphenFr extends TokenFilter
5691 PREFIX .put ("Quelqu'" , "quelque" .toCharArray ());
5792 PREFIX .put ("quoiqu'" , "quoique" .toCharArray ());
5893 PREFIX .put ("Quoiqu'" , "quoique" .toCharArray ());
59- PREFIX .put ("s'" , "se" .toCharArray ()); // il s’aime.
94+ PREFIX .put ("s'" , "se" .toCharArray ());
6095 PREFIX .put ("S'" , "se" .toCharArray ());
61- PREFIX .put ("t'" , "te" .toCharArray ()); // il t’aime.
96+ PREFIX .put ("t'" , "te" .toCharArray ());
6297 PREFIX .put ("T'" , "te" .toCharArray ());
6398 }
64- // https://fr.wikipedia.org/wiki/Emploi_du_trait_d%27union_pour_les_pr%C3%A9fixes_en_fran%C3%A7ais
99+
65100 /** Hyphen suffixes */
66- static final CharArrayMap <char []> SUFFIX = new CharArrayMap <>(30 , false );
101+ private static final CharArrayMap <char []> SUFFIX = new CharArrayMap <>(30 , false );
67102 static {
103+
68104 SUFFIX .put ("-ce" , "ce" .toCharArray ()); // Serait-ce ?
69105 SUFFIX .put ("-ci" , null ); // cette année-ci, ceux-ci.
70106 SUFFIX .put ("-elle" , "elle" .toCharArray ()); // dit-elle.
@@ -92,124 +128,123 @@ public class FilterAposHyphenFr extends TokenFilter
92128 SUFFIX .put ("-y" , "y" .toCharArray ()); // allons-y.
93129 }
94130
95-
96-
97- /**
98- * Default constructor.
99- * @param input previous filter.
100- */
/**
 * Builds the filter on top of an upstream token stream.
 *
 * @param input the stream handed to the parent {@code TokenFilter}; its tokens
 *              are the ones this filter reads from.
 */
public FilterAposHyphenFr(final TokenStream input) {
    super(input);
}
104134
105135 @ Override
106136 public final boolean incrementToken () throws IOException
107137 {
108- // check if a term has been stored from last call
138+ // Emit buffered tokens first
109139 if (!deque .isEmpty ()) {
110140 deque .removeFirst (termAtt , offsetAtt );
111141 }
112142 else {
113- if (!input .incrementToken ()) {
114- // end of stream
115- return false ;
116- }
143+ if (!input .incrementToken ()) return false ;
117144 }
145+
118146 // do not try to split in XML tags
119147 if (flagsAtt .getFlags () == XML .code ) {
120148 return true ;
121149 }
122- int loop = 0 ;
123- while (true ) {
124- if (++loop > 10 ) {
125- throw new IOException ("AposHyph décon: " + deque );
126- }
127- char [] chars = termAtt .buffer ();
128- int hyphLast = termAtt .length () - 1 ;
129- for (; hyphLast >= 0 ; hyphLast --) {
130- if ('-' == chars [hyphLast ]) break ;
131- }
132- int aposFirst = 0 ;
133- for (; aposFirst < termAtt .length (); aposFirst ++) {
134- if (chars [aposFirst ] == '’' ) chars [aposFirst ] = '\'' ;
135- if ('\'' == chars [aposFirst ]) break ;
136- }
137- if (aposFirst >= termAtt .length ()) aposFirst = -1 ;
138150
139- if (aposFirst < 0 && hyphLast < 0 ) {
140- // no changes
141- return true ;
142- }
143- // apos is last char, let it run, maybe maths A', D'
144- if ((aposFirst + 1 ) == termAtt .length ()) {
145- return true ;
146- }
147- // hyphen is first char, let it run, maybe linguistic -suffix
148- if (hyphLast == 0 ) {
149- return true ;
150- }
151- // test prefixes
151+ for (int step = 0 ; step < MAX_STEPS ; step ++) {
152+ final int len = termAtt .length ();
153+ if (len <= 1 ) return true ;
154+
155+ final char [] buf = termAtt .buffer ();
156+
157+ final int hyphLast = lastHyphenIndexAndNormalize (buf , len );
158+ final int aposFirst = firstAposIndexAndNormalize (buf , len );
159+
160+ if (aposFirst < 0 && hyphLast < 0 ) return true ;
161+
162+ // apos is last char, let it run (maths A', D', etc.)
163+ if (aposFirst == len - 1 ) return true ;
164+
165+ // hyphen is first or last char, let it run
166+ if (hyphLast == 0 || hyphLast == len - 1 ) return true ;
167+
168+ // Prefix split on apostrophe
152169 if (aposFirst > 0 ) {
153170 final int startOffset = offsetAtt .startOffset ();
154- if (PREFIX .containsKey (termAtt .buffer (), 0 , aposFirst + 1 )) {
155- final char [] value = PREFIX .get (termAtt .buffer (), 0 , aposFirst + 1 );
156- /* Strip prefix ?
157- if (value == null) {
158- // skip this prefix, retry to find something
159- termAtt.copyBuffer(termAtt.buffer(), aposFirst + 1, termAtt.length() - aposFirst - 1);
160- offsetAtt.setOffset(startOffset + aposFirst + 1, offsetAtt.endOffset());
161- continue;
162- }
163- */
171+ final int prefixLen = aposFirst + 1 ;
172+
173+ final char [] value = PREFIX .get (buf , 0 , prefixLen );
174+ if (value != null ) {
164175 // keep term after prefix for next call
165176 deque .addLast (
166- termAtt . buffer (),
167- aposFirst + 1 ,
168- termAtt . length () - aposFirst - 1 ,
169- startOffset + aposFirst + 1 ,
177+ buf ,
178+ prefixLen ,
179+ len - prefixLen ,
180+ startOffset + prefixLen ,
170181 offsetAtt .endOffset ()
171182 );
172183 // send the prefix
173184 termAtt .copyBuffer (value , 0 , value .length );
174- termAtt .setLength (aposFirst + 1 );
175- offsetAtt .setOffset (startOffset , startOffset + aposFirst + 1 );
185+ offsetAtt .setOffset (startOffset , startOffset + prefixLen );
176186 return true ;
177187 }
178188 }
189+
190+ // Suffix split on hyphen
179191 if (hyphLast > 0 ) {
180- // test suffix
181- if (SUFFIX .containsKey (termAtt .buffer (), hyphLast , termAtt .length () - hyphLast )) {
182- final char [] value = SUFFIX .get (termAtt .buffer (), hyphLast , termAtt .length () - hyphLast );
183- // if value is not skipped, add it at start in stack
192+ final int suffixLen = len - hyphLast ;
193+
194+ if (SUFFIX .containsKey (buf , hyphLast , suffixLen )) {
195+ final char [] value = SUFFIX .get (buf , hyphLast , suffixLen );
196+
184197 if (value != null ) {
185198 deque .addFirst (
186- value ,
187- 0 ,
199+ value ,
200+ 0 ,
188201 value .length ,
189- offsetAtt .startOffset ()+ hyphLast ,
202+ offsetAtt .startOffset () + hyphLast ,
190203 offsetAtt .endOffset ()
191204 );
192205 }
193- // set term without suffix, let work the loop
206+
207+ // set term without suffix, loop again (may strip multiple suffixes)
194208 offsetAtt .setOffset (offsetAtt .startOffset (), offsetAtt .startOffset () + hyphLast );
195209 termAtt .setLength (hyphLast );
196210 continue ;
197211 }
198212 }
213+
199214 return true ; // term is OK like that
200215 }
216+
217+ throw new IllegalStateException ("FilterAposHyphenFr: exceeded MAX_STEPS, deque=" + deque );
201218 }
202-
203- @ Override
204- public void reset () throws IOException
205- {
206- super .reset ();
219+
220+ private static int firstAposIndexAndNormalize (final char [] buf , final int len ) {
221+ for (int i = 0 ; i < len ; i ++) {
222+ char c = buf [i ];
223+ if (c == '’' || c == '\u02BC' ) { // U+2019 or U+02BC
224+ buf [i ] = '\'' ;
225+ c = '\'' ;
226+ }
227+ if (c == '\'' ) return i ;
228+ }
229+ return -1 ;
230+ }
231+
232+ private static int lastHyphenIndexAndNormalize (final char [] buf , final int len ) {
233+ for (int i = len - 1 ; i >= 0 ; i --) {
234+ char c = buf [i ];
235+ if (c == '\u2010' || c == '\u2011' || c == '\u00AD' ) { // hyphen variants
236+ buf [i ] = '-' ;
237+ c = '-' ;
238+ }
239+ if (c == '-' ) return i ;
240+ }
241+ return -1 ;
207242 }
208243
209244 @ Override
210- public void end () throws IOException
245+ public void reset () throws IOException
211246 {
212- super .end ();
247+ super .reset ();
248+ deque .clear (); // add clear() to AttLinkedList (recommended)
213249 }
214-
215250}
0 commit comments