Skip to content

Commit cfe7a4c

Browse files
committed
Isolate French tokenizer logic with no Lucene dependencies
1 parent 40e736b commit cfe7a4c

13 files changed

Lines changed: 970 additions & 375 deletions

File tree

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/CleanupFilter.java

Lines changed: 137 additions & 80 deletions
Large diffs are not rendered by default.

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/LexiconHelper.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import com.github.oeuvres.alix.common.Upos;
1616
import com.github.oeuvres.alix.util.CSVReader;
1717
import com.github.oeuvres.alix.util.Char;
18+
import com.github.oeuvres.alix.util.MweLexicon;
1819
import com.github.oeuvres.alix.util.Report;
1920

2021
import opennlp.tools.postag.POSModel;

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/MweFilter.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@
4343
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
4444
import org.apache.lucene.util.AttributeSource;
4545

46+
import com.github.oeuvres.alix.util.MweLexicon;
47+
4648

4749
/**
4850
* A {@link TokenFilter} that merges multi-word expressions (MWEs) into single tokens,
@@ -179,7 +181,10 @@ private void emitMerged(final int matchPos, final int matchOrd)
179181
// Restore all attributes from first component (posIncr, startOffset, flags, ...).
180182
queue.restoreTo(this, 0);
181183

182-
lexicon.formToAttribute(matchOrd, termAtt);
184+
final int len = lexicon.formLength(matchOrd);
185+
final char[] buf = termAtt.resizeBuffer(len);
186+
lexicon.formToChars(matchOrd, buf, 0);
187+
termAtt.setLength(len);
183188

184189
// Fix endOffset and type.
185190
offsetAtt.setOffset(offsetAtt.startOffset(), endOffset);

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/MweLexicon.java

Lines changed: 0 additions & 289 deletions
This file was deleted.

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/fr/FrenchAnalyzer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,10 @@
5353
import com.github.oeuvres.alix.lucene.analysis.MarkupTokenizer;
5454
import com.github.oeuvres.alix.lucene.analysis.MarkupZoneFilter;
5555
import com.github.oeuvres.alix.lucene.analysis.MweFilter;
56-
import com.github.oeuvres.alix.lucene.analysis.MweLexicon;
5756
import com.github.oeuvres.alix.lucene.analysis.PosTaggingFilter;
5857
import com.github.oeuvres.alix.lucene.analysis.SentenceStartLowerCaseFilter;
5958
import com.github.oeuvres.alix.lucene.analysis.TermReplaceFilter;
59+
import com.github.oeuvres.alix.util.MweLexicon;
6060

6161
import opennlp.tools.postag.POSModel;
6262

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/fr/FrenchLexicons.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@
4444

4545
import com.github.oeuvres.alix.lucene.analysis.LemmaLexicon;
4646
import com.github.oeuvres.alix.lucene.analysis.LexiconHelper;
47-
import com.github.oeuvres.alix.lucene.analysis.MweLexicon;
4847
import com.github.oeuvres.alix.lucene.analysis.LexiconHelper.PosResolver;
48+
import com.github.oeuvres.alix.util.MweLexicon;
4949

5050
public class FrenchLexicons
5151
{

test/src/main/java/com/github/oeuvres/alix/lucene/analysis/fr/FrenchMweFilterDemo.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import com.github.oeuvres.alix.lucene.analysis.LexiconHelper;
55
import com.github.oeuvres.alix.lucene.analysis.MarkupTokenizer;
66
import com.github.oeuvres.alix.lucene.analysis.MweFilter;
7-
import com.github.oeuvres.alix.lucene.analysis.MweLexicon;
7+
import com.github.oeuvres.alix.util.MweLexicon;
88

99
import org.apache.lucene.analysis.Analyzer;
1010
import org.apache.lucene.analysis.TokenStream;

0 commit comments

Comments
 (0)