Skip to content

Commit bbd279a

Browse files
committed
LemmaLexicon as a generic util
1 parent d1992b2 commit bbd279a

15 files changed

Lines changed: 1085 additions & 1218 deletions

File tree

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/LemmaFilter.java

Lines changed: 10 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,7 @@
4646
import com.github.oeuvres.alix.common.Upos;
4747
import com.github.oeuvres.alix.lucene.analysis.tokenattributes.LemmaAttribute;
4848
import com.github.oeuvres.alix.lucene.analysis.tokenattributes.PosAttribute;
49+
import com.github.oeuvres.alix.util.LemmaLexicon;
4950

5051

5152
/**
@@ -130,7 +131,7 @@ public final class LemmaFilter extends TokenFilter
130131
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
131132
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
132133
private final PosAttribute posAtt = addAttribute(PosAttribute.class);
133-
private final LemmaAttribute lemAtt = addAttribute(LemmaAttribute.class);
134+
// private final LemmaAttribute lemAtt = addAttribute(LemmaAttribute.class);
134135

135136
/**
136137
* Creates a lemmatization side-channel filter.
@@ -160,9 +161,6 @@ public boolean incrementToken() throws IOException
160161
{
161162
if (!input.incrementToken()) return false;
162163

163-
// Invariant: lemma slot is empty unless this filter resolves and writes one.
164-
lemAtt.setEmpty();
165-
166164
if (keywordAtt.isKeyword()) return true;
167165

168166
final int posId = posAtt.getPos();
@@ -176,22 +174,25 @@ public boolean incrementToken() throws IOException
176174
}
177175

178176
// Surface known ?
179-
final int formId = lex.findFormId(termAtt);
177+
final int formId = lex.id(termAtt);
180178
if (formId < 0) return true;
181179

182180
// Lookup with pos
183-
int lemmaId = (posId >= 0) ? lex.findLemmaId(formId, posId) : -1;
181+
int lemmaId = (posId >= 0) ? lex.lemmaId(formId, posId) : -1;
184182

185183
// Default lemma (pos-agnostic)
186184
if (lemmaId < 0) {
187-
lemmaId = lex.findLemmaId(formId); // returns -1 if none
185+
lemmaId = lex.lemmaId(formId); // returns -1 if none
188186
}
189187

190188
// Nothing usable
191189
if (lemmaId < 0 || lemmaId == formId) return true;
192190

193-
// Copy lemma
194-
lex.copyForm(lemmaId, termAtt);
191+
// Copy lemma to term for indexation
192+
final int len = lex.length(lemmaId);
193+
final char[] dst = termAtt.resizeBuffer(len);
194+
lex.copy(lemmaId, dst, 0);
195+
termAtt.setLength(len);
195196

196197
return true;
197198
}

0 commit comments

Comments
 (0)