Skip to content

Commit d1992b2

Browse files
committed
MweLexicon is now free from lucene
1 parent cfe7a4c commit d1992b2

10 files changed

Lines changed: 830 additions & 522 deletions

File tree

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/LexiconHelper.java

Lines changed: 56 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import java.nio.file.Path;
77
import java.util.Collections;
88
import java.util.HashMap;
9+
import java.util.List;
910
import java.util.Map;
1011
import java.util.Objects;
1112

@@ -17,6 +18,7 @@
1718
import com.github.oeuvres.alix.util.Char;
1819
import com.github.oeuvres.alix.util.MweLexicon;
1920
import com.github.oeuvres.alix.util.Report;
21+
import com.github.oeuvres.alix.util.WordTokenizer;
2022

2123
import opennlp.tools.postag.POSModel;
2224

@@ -99,49 +101,30 @@ private abstract static class CsvRowHandler
99101
}
100102

101103
/**
102-
* Load a CSV reader into a {@link MweLexicon}.
104+
* Load expressions (column 0) and their optional canonical forms (column 1) from a CSV file.
103105
*
104106
* @param lexicon
105-
* @param csv CSV reader
106-
* @param col column index containing the form to add
107-
* @param skipHeader if {@code true}, the first row is skipped
107+
* @param tokenizer tokenizer used to split each expression into words
108+
* @param file path of the CSV file to read
108109
* @throws UncheckedIOException on read error
109-
* @throws NullPointerException if {@code lexicon} or {@code csv} is null
110-
* @throws IllegalArgumentException if {@code col < 0}
110+
* @throws NullPointerException if {@code lexicon} or
111+
* {@code file} is null
111112
*/
112113
public static void loadExpressions(
113114
final MweLexicon lexicon,
114-
final CSVReader csv,
115-
final int colExpression,
116-
final int colCanonical,
117-
final CsvHeader csvHeader)
115+
final WordTokenizer tokenizer,
116+
final Path file
117+
)
118118
{
119-
Objects.requireNonNull(lexicon, "lexicon");
120-
Objects.requireNonNull(csv, "csv");
121-
checkColumnIndex(colExpression, "colExpression");
122-
checkColumnIndex(colCanonical, "colCanonical");
123-
124-
final CsvRowHandler handler = new CsvRowHandler()
125-
{
126-
@Override
127-
protected boolean accept(final CSVReader row) throws UncheckedIOException
128-
{
129-
final StringBuilder expression = row.getCell(colExpression);
130-
if (expression.length() == 0) return false;
131-
final StringBuilder canonical = row.getCell(colCanonical);
132-
if (canonical.length() > 0) {
133-
lexicon.addExpression(expression, canonical);
134-
return true;
135-
}
136-
lexicon.addExpression(expression);
137-
return true;
138-
}
139-
};
140-
141-
forEachDataRow(csv, csvHeader, handler);
119+
Objects.requireNonNull(file, "file");
120+
try (CSVReader csv = new CSVReader(file, ',', 2)) {
121+
loadExpressions(lexicon, tokenizer, csv, 0, 1, CsvHeader.SKIP);
122+
} catch (IOException e) {
123+
throw new UncheckedIOException(e);
124+
}
142125
}
143126

144-
127+
145128
/**
146129
* Load one column of expressions
147130
*
@@ -154,44 +137,67 @@ protected boolean accept(final CSVReader row) throws UncheckedIOException
154137
*/
155138
public static void loadExpressions(
156139
final MweLexicon lexicon,
140+
final WordTokenizer tokenizer,
157141
final Class<?> anchor,
158142
final String resourcePath
159143
)
160144
{
161145
Objects.requireNonNull(anchor, "anchor");
162146
Objects.requireNonNull(resourcePath, "resourcePath");
163147
try (CSVReader csv = new CSVReader(anchor, resourcePath, ',', 2)) {
164-
loadExpressions(lexicon, csv, 0, 1, CsvHeader.SKIP);
148+
loadExpressions(lexicon, tokenizer, csv, 0, 1, CsvHeader.SKIP);
165149
} catch (IOException e) {
166150
throw new UncheckedIOException(e);
167151
}
168152
}
169153

170154
/**
171-
* Load one column of expressions
155+
* Load a CSV reader into a {@link MweLexicon}.
172156
*
173157
* @param lexicon
174-
* @param anchor class used to resolve the resource path
175-
* @param resourcePath classpath resource path
158+
* @param csv CSV reader
159+
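* @param colExpression column index containing the expression to add
* @param colCanonical column index containing the canonical form (the cell may be empty)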
160+
* @param csvHeader how the first row is handled (e.g. {@code CsvHeader.SKIP} to skip it)
176161
* @throws UncheckedIOException on read error
177-
* @throws NullPointerException if {@code lexicon}, {@code anchor}, or
178-
* {@code resourcePath} is null
162+
* @throws NullPointerException if {@code lexicon} or {@code csv} is null
163+
* @throws IllegalArgumentException if {@code colExpression < 0} or {@code colCanonical < 0}
179164
*/
180165
public static void loadExpressions(
181166
final MweLexicon lexicon,
182-
final Path file
183-
)
167+
final WordTokenizer tokenizer,
168+
final CSVReader csv,
169+
final int colExpression,
170+
final int colCanonical,
171+
final CsvHeader csvHeader)
184172
{
185-
Objects.requireNonNull(file, "file");
186-
try (CSVReader csv = new CSVReader(file, ',', 2)) {
187-
loadExpressions(lexicon, csv, 0, 1, CsvHeader.SKIP);
188-
} catch (IOException e) {
189-
throw new UncheckedIOException(e);
190-
}
173+
Objects.requireNonNull(lexicon, "lexicon");
174+
Objects.requireNonNull(csv, "csv");
175+
checkColumnIndex(colExpression, "colExpression");
176+
checkColumnIndex(colCanonical, "colCanonical");
177+
178+
final CsvRowHandler handler = new CsvRowHandler()
179+
{
180+
@Override
181+
protected boolean accept(final CSVReader row) throws UncheckedIOException
182+
{
183+
final StringBuilder expression = row.getCell(colExpression);
184+
if (expression.length() == 0) return false;
185+
final StringBuilder canonical = row.getCell(colCanonical);
186+
List<String> words = tokenizer.tokenize(expression);
187+
188+
if (canonical.length() > 0) {
189+
lexicon.addExpression(words, canonical);
190+
} else {
191+
lexicon.addExpression(words, expression);
192+
}
193+
return true;
194+
}
195+
};
196+
197+
forEachDataRow(csv, csvHeader, handler);
191198
}
192-
193199

194-
200+
195201
/**
196202
* Load a 2-column CSV reader into a {@link CharArrayMap}.
197203
* <p>

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/fr/FrenchAnalyzer.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@
5757
import com.github.oeuvres.alix.lucene.analysis.SentenceStartLowerCaseFilter;
5858
import com.github.oeuvres.alix.lucene.analysis.TermReplaceFilter;
5959
import com.github.oeuvres.alix.util.MweLexicon;
60+
import com.github.oeuvres.alix.util.WordTokenizer;
61+
import com.github.oeuvres.alix.util.fr.FrenchCliticTokenizer;
6062

6163
import opennlp.tools.postag.POSModel;
6264

@@ -112,8 +114,9 @@ public void addNormalizations(List<Path> files) throws IOException {
112114
}
113115
}
114116
public void addExpressions(List<Path> files) throws IOException {
117+
WordTokenizer tokenizer = new FrenchCliticTokenizer();
115118
for (Path path: files) {
116-
LexiconHelper.loadExpressions(expressions, path);
119+
LexiconHelper.loadExpressions(expressions, tokenizer, path);
117120
}
118121
expressions.freeze();
119122
}

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/fr/FrenchLexicons.java

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -35,17 +35,15 @@
3535

3636
import java.util.Map;
3737

38-
import org.apache.lucene.analysis.Analyzer;
3938
import org.apache.lucene.analysis.CharArrayMap;
4039
import org.apache.lucene.analysis.CharArraySet;
41-
import org.apache.lucene.analysis.TokenStream;
42-
import org.apache.lucene.analysis.Tokenizer;
43-
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
4440

4541
import com.github.oeuvres.alix.lucene.analysis.LemmaLexicon;
4642
import com.github.oeuvres.alix.lucene.analysis.LexiconHelper;
4743
import com.github.oeuvres.alix.lucene.analysis.LexiconHelper.PosResolver;
4844
import com.github.oeuvres.alix.util.MweLexicon;
45+
import com.github.oeuvres.alix.util.WordTokenizer;
46+
import com.github.oeuvres.alix.util.fr.FrenchCliticTokenizer;
4947

5048
public class FrenchLexicons
5149
{
@@ -129,15 +127,9 @@ protected String posRewrite(String posName)
129127

130128
public static MweLexicon buildMweLexicon()
131129
{
132-
MweLexicon lexicon = new MweLexicon(new Analyzer() {
133-
@Override
134-
protected TokenStreamComponents createComponents(String fieldName) {
135-
Tokenizer tokenizer = new WhitespaceTokenizer();
136-
TokenStream ts = new FrenchCliticSplitFilter(tokenizer);
137-
return new TokenStreamComponents(tokenizer, ts);
138-
}
139-
}, "mwe", 2000);
140-
LexiconHelper.loadExpressions(lexicon, LexiconHelper.class, "/com/github/oeuvres/alix/fr/expressions.csv");
130+
MweLexicon lexicon = new MweLexicon(2000);
131+
WordTokenizer tokenizer = new FrenchCliticTokenizer();
132+
LexiconHelper.loadExpressions(lexicon, tokenizer, LexiconHelper.class, "/com/github/oeuvres/alix/fr/expressions.csv");
141133
return lexicon;
142134
}
143135

test/src/main/java/com/github/oeuvres/alix/lucene/analysis/fr/FrenchMweFilterDemo.java

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,12 @@
55
import com.github.oeuvres.alix.lucene.analysis.MarkupTokenizer;
66
import com.github.oeuvres.alix.lucene.analysis.MweFilter;
77
import com.github.oeuvres.alix.util.MweLexicon;
8+
import com.github.oeuvres.alix.util.WordTokenizer;
9+
import com.github.oeuvres.alix.util.fr.FrenchCliticTokenizer;
810

911
import org.apache.lucene.analysis.Analyzer;
1012
import org.apache.lucene.analysis.TokenStream;
1113
import org.apache.lucene.analysis.Tokenizer;
12-
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
1314

1415
import java.util.List;
1516

@@ -39,8 +40,10 @@ private FrenchMweFilterDemo() {}
3940
) );
4041

4142
public static void main(String[] args) throws Exception {
42-
MweLexicon lexicon = new MweLexicon(expressionAnalyzer(), "mwe", 2000);
43-
LexiconHelper.loadExpressions(lexicon, LexiconHelper.class, "/com/github/oeuvres/alix/fr/expressions.csv");
43+
MweLexicon lexicon = new MweLexicon(2000);
44+
WordTokenizer tokenizer = new FrenchCliticTokenizer();
45+
46+
LexiconHelper.loadExpressions(lexicon, tokenizer, LexiconHelper.class, "/com/github/oeuvres/alix/fr/expressions.csv");
4447
lexicon.freeze();
4548
try (Analyzer analyzer = buildAnalyzer(lexicon)) {
4649

@@ -51,17 +54,6 @@ public static void main(String[] args) throws Exception {
5154
}
5255
}
5356

54-
private static Analyzer expressionAnalyzer() {
55-
return new Analyzer() {
56-
@Override
57-
protected TokenStreamComponents createComponents(String fieldName) {
58-
Tokenizer tokenizer = new WhitespaceTokenizer();
59-
TokenStream ts = new FrenchCliticSplitFilter(tokenizer);
60-
return new TokenStreamComponents(tokenizer, ts);
61-
}
62-
};
63-
}
64-
6557

6658
/** Minimal Analyzer for MLTokenizer -> FrenchCliticSplitFilter. */
6759
private static Analyzer buildAnalyzer(MweLexicon lexicon) {

0 commit comments

Comments
 (0)