oeuvres
diff --git a/‎analysis/src/java/com/github/oeuvres/alix/lucene/analysis/LexiconHelper.java‎
Lines changed: 56 additions & 50 deletions b/‎analysis/src/java/com/github/oeuvres/alix/lucene/analysis/LexiconHelper.java‎
Lines changed: 56 additions & 50 deletions
diff --git a/‎analysis/src/java/com/github/oeuvres/alix/lucene/analysis/fr/FrenchAnalyzer.java‎
Lines changed: 4 additions & 1 deletion b/‎analysis/src/java/com/github/oeuvres/alix/lucene/analysis/fr/FrenchAnalyzer.java‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎analysis/src/java/com/github/oeuvres/alix/lucene/analysis/fr/FrenchLexicons.java‎
Lines changed: 5 additions & 13 deletions b/‎analysis/src/java/com/github/oeuvres/alix/lucene/analysis/fr/FrenchLexicons.java‎
Lines changed: 5 additions & 13 deletions
diff --git a/‎test/src/main/java/com/github/oeuvres/alix/lucene/analysis/fr/FrenchMweFilterDemo.java‎
Lines changed: 6 additions & 14 deletions b/‎test/src/main/java/com/github/oeuvres/alix/lucene/analysis/fr/FrenchMweFilterDemo.java‎
Lines changed: 6 additions & 14 deletions
@@ -6,6 +6,7 @@
 import java.nio.file.Path;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Objects;
 
@@ -17,6 +18,7 @@
 import com.github.oeuvres.alix.util.Char;
 import com.github.oeuvres.alix.util.MweLexicon;
 import com.github.oeuvres.alix.util.Report;
+import com.github.oeuvres.alix.util.WordTokenizer;
 
 import opennlp.tools.postag.POSModel;
 
@@ -99,49 +101,30 @@ private abstract static class CsvRowHandler
     }
 
     /**
-     * Load a CSV reader into a {@link MweLexicon}.
+     * Load expressions from a CSV file (column 0: expression, column 1: optional canonical form).
      *
      * @param lexicon
-     * @param csv        CSV reader
-     * @param col        column index containing the form to add
-     * @param skipHeader if {@code true}, the first row is skipped
+     * @param tokenizer    splits each expression into words before it is added
+     * @param file         path of the CSV file to read
      * @throws UncheckedIOException              on read error
-     * @throws NullPointerException     if {@code lexicon} or {@code csv} is null
-     * @throws IllegalArgumentException if {@code col < 0}
+     * @throws NullPointerException     if {@code lexicon} or {@code file}
+     *                                  is null
      */
     public static void loadExpressions(
         final MweLexicon lexicon,
-        final CSVReader csv,
-        final int colExpression,
-        final int colCanonical,
-        final CsvHeader csvHeader)
+        final WordTokenizer tokenizer,
+        final Path file
+    )
     {
-        Objects.requireNonNull(lexicon, "lexicon");
-        Objects.requireNonNull(csv, "csv");
-        checkColumnIndex(colExpression, "colExpression");
-        checkColumnIndex(colCanonical, "colCanonical");
-        
-        final CsvRowHandler handler = new CsvRowHandler()
-        {
-            @Override
-            protected boolean accept(final CSVReader row) throws UncheckedIOException
-            {
-                final StringBuilder expression = row.getCell(colExpression);
-                if (expression.length() == 0) return false;
-                final StringBuilder canonical = row.getCell(colCanonical);
-                if (canonical.length() > 0) {
-                    lexicon.addExpression(expression, canonical);
-                    return true;
-                }
-                lexicon.addExpression(expression);
-                return true;
-            }
-        };
-        
-        forEachDataRow(csv, csvHeader, handler);
+        Objects.requireNonNull(file, "file");
+        try (CSVReader csv = new CSVReader(file, ',', 2)) {
+            loadExpressions(lexicon, tokenizer, csv, 0, 1, CsvHeader.SKIP);
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
     }
 
-    
+
     /**
      * Load one column of expressions
      *
@@ -154,44 +137,67 @@ protected boolean accept(final CSVReader row) throws UncheckedIOException
      */
     public static void loadExpressions(
         final MweLexicon lexicon,
+        final WordTokenizer tokenizer,
         final Class<?> anchor,
         final String resourcePath
     )
     {
         Objects.requireNonNull(anchor, "anchor");
         Objects.requireNonNull(resourcePath, "resourcePath");
         try (CSVReader csv = new CSVReader(anchor, resourcePath, ',', 2)) {
-            loadExpressions(lexicon, csv, 0, 1, CsvHeader.SKIP);
+            loadExpressions(lexicon, tokenizer, csv, 0, 1, CsvHeader.SKIP);
         } catch (IOException e) {
             throw new UncheckedIOException(e);
         }
     }
 
     /**
-     * Load one column of expressions
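+     * Load a CSV reader into a {@link MweLexicon}: each expression is split into words by {@code tokenizer}; {@code csvHeader} sets the header-row policy.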
      *
      * @param lexicon
-     * @param anchor       class used to resolve the resource path
-     * @param resourcePath classpath resource path
+     * @param csv        CSV reader
+     * @param colExpression column index of the expression to tokenize and add
+     * @param colCanonical  column index of the canonical form; when that cell is empty the expression itself is used
      * @throws UncheckedIOException              on read error
-     * @throws NullPointerException     if {@code lexicon}, {@code anchor}, or
-     *                                  {@code resourcePath} is null
+     * @throws NullPointerException     if {@code lexicon} or {@code csv} is null
+     * @throws IllegalArgumentException if {@code colExpression < 0} or {@code colCanonical < 0}
      */
     public static void loadExpressions(
         final MweLexicon lexicon,
-        final Path file
-    )
+        final WordTokenizer tokenizer,
+        final CSVReader csv,
+        final int colExpression,
+        final int colCanonical,
+        final CsvHeader csvHeader)
     {
-        Objects.requireNonNull(file, "file");
-        try (CSVReader csv = new CSVReader(file, ',', 2)) {
-            loadExpressions(lexicon, csv, 0, 1, CsvHeader.SKIP);
-        } catch (IOException e) {
-            throw new UncheckedIOException(e);
-        }
+        Objects.requireNonNull(lexicon, "lexicon");
+        Objects.requireNonNull(csv, "csv");
+        checkColumnIndex(colExpression, "colExpression");
+        checkColumnIndex(colCanonical, "colCanonical");
+        
+        final CsvRowHandler handler = new CsvRowHandler()
+        {
+            @Override
+            protected boolean accept(final CSVReader row) throws UncheckedIOException
+            {
+                final StringBuilder expression = row.getCell(colExpression);
+                if (expression.length() == 0) return false;
+                final StringBuilder canonical = row.getCell(colCanonical);
+                List<String> words = tokenizer.tokenize(expression);
+                
+                if (canonical.length() > 0) {
+                    lexicon.addExpression(words, canonical);
+                } else {
+                    lexicon.addExpression(words, expression);
+                }
+                return true;
+            }
+        };
+        
+        forEachDataRow(csv, csvHeader, handler);
     }
-    
 
-    
+
     /**
      * Load a 2-column CSV reader into a {@link CharArrayMap}.
      * <p>
 
@@ -57,6 +57,8 @@
 import com.github.oeuvres.alix.lucene.analysis.SentenceStartLowerCaseFilter;
 import com.github.oeuvres.alix.lucene.analysis.TermReplaceFilter;
 import com.github.oeuvres.alix.util.MweLexicon;
+import com.github.oeuvres.alix.util.WordTokenizer;
+import com.github.oeuvres.alix.util.fr.FrenchCliticTokenizer;
 
 import opennlp.tools.postag.POSModel;
 
@@ -112,8 +114,9 @@ public void addNormalizations(List<Path> files) throws IOException {
         }
     }
     public void addExpressions(List<Path> files) throws IOException {
+        WordTokenizer tokenizer = new FrenchCliticTokenizer();
         for (Path path: files) {
-            LexiconHelper.loadExpressions(expressions, path);
+            LexiconHelper.loadExpressions(expressions, tokenizer, path);
         }
         expressions.freeze();
     }
 
@@ -35,17 +35,15 @@
 
 import java.util.Map;
 
-import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArrayMap;
 import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 
 import com.github.oeuvres.alix.lucene.analysis.LemmaLexicon;
 import com.github.oeuvres.alix.lucene.analysis.LexiconHelper;
 import com.github.oeuvres.alix.lucene.analysis.LexiconHelper.PosResolver;
 import com.github.oeuvres.alix.util.MweLexicon;
+import com.github.oeuvres.alix.util.WordTokenizer;
+import com.github.oeuvres.alix.util.fr.FrenchCliticTokenizer;
 
 public class FrenchLexicons
 {
@@ -129,15 +127,9 @@ protected String posRewrite(String posName)
 
     public static MweLexicon buildMweLexicon()
     {
-        MweLexicon lexicon = new MweLexicon(new Analyzer() {
-            @Override
-            protected TokenStreamComponents createComponents(String fieldName) {
-                Tokenizer tokenizer = new WhitespaceTokenizer();
-                TokenStream ts = new FrenchCliticSplitFilter(tokenizer);
-                return new TokenStreamComponents(tokenizer, ts);
-            }
-        }, "mwe", 2000);
-        LexiconHelper.loadExpressions(lexicon, LexiconHelper.class, "/com/github/oeuvres/alix/fr/expressions.csv");
+        MweLexicon lexicon = new MweLexicon(2000);
+        WordTokenizer tokenizer = new FrenchCliticTokenizer();
+        LexiconHelper.loadExpressions(lexicon, tokenizer, LexiconHelper.class, "/com/github/oeuvres/alix/fr/expressions.csv");
         return lexicon;
     }
 
 
@@ -5,11 +5,12 @@
 import com.github.oeuvres.alix.lucene.analysis.MarkupTokenizer;
 import com.github.oeuvres.alix.lucene.analysis.MweFilter;
 import com.github.oeuvres.alix.util.MweLexicon;
+import com.github.oeuvres.alix.util.WordTokenizer;
+import com.github.oeuvres.alix.util.fr.FrenchCliticTokenizer;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 
 import java.util.List;
 
@@ -39,8 +40,10 @@ private FrenchMweFilterDemo() {}
         )    );
 
     public static void main(String[] args) throws Exception {
-        MweLexicon lexicon = new MweLexicon(expressionAnalyzer(), "mwe", 2000);
-        LexiconHelper.loadExpressions(lexicon, LexiconHelper.class, "/com/github/oeuvres/alix/fr/expressions.csv");
+        MweLexicon lexicon = new MweLexicon(2000);
+        WordTokenizer tokenizer = new FrenchCliticTokenizer();
+
+        LexiconHelper.loadExpressions(lexicon, tokenizer, LexiconHelper.class, "/com/github/oeuvres/alix/fr/expressions.csv");
         lexicon.freeze();
         try (Analyzer analyzer = buildAnalyzer(lexicon)) {
 
@@ -51,17 +54,6 @@ public static void main(String[] args) throws Exception {
         }
     }
 
-    private static Analyzer expressionAnalyzer() {
-        return new Analyzer() {
-            @Override
-            protected TokenStreamComponents createComponents(String fieldName) {
-                Tokenizer tokenizer = new WhitespaceTokenizer();
-                TokenStream ts = new FrenchCliticSplitFilter(tokenizer);
-                return new TokenStreamComponents(tokenizer, ts);
-            }
-        };
-    }
-
 
     /** Minimal Analyzer for MLTokenizer -> FrenchCliticSplitFilter. */
     private static Analyzer buildAnalyzer(MweLexicon lexicon) {
Original file line number	Diff line number	Diff line change
`@@ -57,6 +57,8 @@`
`57`	`57`	`import com.github.oeuvres.alix.lucene.analysis.SentenceStartLowerCaseFilter;`
`58`	`58`	`import com.github.oeuvres.alix.lucene.analysis.TermReplaceFilter;`
`59`	`59`	`import com.github.oeuvres.alix.util.MweLexicon;`
	`60`	`+import com.github.oeuvres.alix.util.WordTokenizer;`
	`61`	`+import com.github.oeuvres.alix.util.fr.FrenchCliticTokenizer;`
`60`	`62`
`61`	`63`	`import opennlp.tools.postag.POSModel;`
`62`	`64`
`@@ -112,8 +114,9 @@ public void addNormalizations(List<Path> files) throws IOException {`
`112`	`114`	`}`
`113`	`115`	`}`
`114`	`116`	`public void addExpressions(List<Path> files) throws IOException {`
	`117`	`+ WordTokenizer tokenizer = new FrenchCliticTokenizer();`
`115`	`118`	`for (Path path: files) {`
`116`		`- LexiconHelper.loadExpressions(expressions, path);`
	`119`	`+ LexiconHelper.loadExpressions(expressions, tokenizer, path);`
`117`	`120`	`}`
`118`	`121`	`expressions.freeze();`
`119`	`122`	`}`