66import java .nio .file .Path ;
77import java .util .Collections ;
88import java .util .HashMap ;
9+ import java .util .List ;
910import java .util .Map ;
1011import java .util .Objects ;
1112
1718import com .github .oeuvres .alix .util .Char ;
1819import com .github .oeuvres .alix .util .MweLexicon ;
1920import com .github .oeuvres .alix .util .Report ;
21+ import com .github .oeuvres .alix .util .WordTokenizer ;
2022
2123import opennlp .tools .postag .POSModel ;
2224
@@ -99,49 +101,30 @@ private abstract static class CsvRowHandler
99101 }
100102
101103 /**
102- * Load a CSV reader into a {@link MweLexicon}.
104+ * Load expressions (column 0) and their canonical forms (column 1) from a CSV file.
103105 *
104106 * @param lexicon
105- * @param csv CSV reader
106- * @param col column index containing the form to add
107- * @param skipHeader if {@code true}, the first row is skipped
107+ * @param tokenizer tokenizer used to split each expression into words
108+ * @param file path of the CSV file to read
108109 * @throws UncheckedIOException on read error
109- * @throws NullPointerException if {@code lexicon} or {@code csv} is null
110- * @throws IllegalArgumentException if {@code col < 0}
110+ * @throws NullPointerException if {@code lexicon} or {@code file}
111+ * is null
111112 */
112113 public static void loadExpressions (
113114 final MweLexicon lexicon ,
114- final CSVReader csv ,
115- final int colExpression ,
116- final int colCanonical ,
117- final CsvHeader csvHeader )
115+ final WordTokenizer tokenizer ,
116+ final Path file
117+ )
118118 {
119- Objects .requireNonNull (lexicon , "lexicon" );
120- Objects .requireNonNull (csv , "csv" );
121- checkColumnIndex (colExpression , "colExpression" );
122- checkColumnIndex (colCanonical , "colCanonical" );
123-
124- final CsvRowHandler handler = new CsvRowHandler ()
125- {
126- @ Override
127- protected boolean accept (final CSVReader row ) throws UncheckedIOException
128- {
129- final StringBuilder expression = row .getCell (colExpression );
130- if (expression .length () == 0 ) return false ;
131- final StringBuilder canonical = row .getCell (colCanonical );
132- if (canonical .length () > 0 ) {
133- lexicon .addExpression (expression , canonical );
134- return true ;
135- }
136- lexicon .addExpression (expression );
137- return true ;
138- }
139- };
140-
141- forEachDataRow (csv , csvHeader , handler );
119+ Objects .requireNonNull (file , "file" );
120+ try (CSVReader csv = new CSVReader (file , ',' , 2 )) {
121+ loadExpressions (lexicon , tokenizer , csv , 0 , 1 , CsvHeader .SKIP );
122+ } catch (IOException e ) {
123+ throw new UncheckedIOException (e );
124+ }
142125 }
143126
144-
127+
145128 /**
146129 * Load one column of expressions
147130 *
@@ -154,44 +137,67 @@ protected boolean accept(final CSVReader row) throws UncheckedIOException
154137 */
155138 public static void loadExpressions (
156139 final MweLexicon lexicon ,
140+ final WordTokenizer tokenizer ,
157141 final Class <?> anchor ,
158142 final String resourcePath
159143 )
160144 {
161145 Objects .requireNonNull (anchor , "anchor" );
162146 Objects .requireNonNull (resourcePath , "resourcePath" );
163147 try (CSVReader csv = new CSVReader (anchor , resourcePath , ',' , 2 )) {
164- loadExpressions (lexicon , csv , 0 , 1 , CsvHeader .SKIP );
148+ loadExpressions (lexicon , tokenizer , csv , 0 , 1 , CsvHeader .SKIP );
165149 } catch (IOException e ) {
166150 throw new UncheckedIOException (e );
167151 }
168152 }
169153
170154 /**
171- * Load one column of expressions
155+ * Load a CSV reader into a {@link MweLexicon}.
172156 *
173157 * @param lexicon
174- * @param anchor class used to resolve the resource path
175- * @param resourcePath classpath resource path
158+ * @param csv CSV reader
159+ * @param colExpression column index of the expression
160+ * @param colCanonical column index of the canonical form; when that cell is empty the expression itself is used
176161 * @throws UncheckedIOException on read error
177- * @throws NullPointerException if {@code lexicon}, {@code anchor}, or
178- * {@code resourcePath} is null
162+ * @throws NullPointerException if {@code lexicon} or {@code csv} is null
163+ * @throws IllegalArgumentException if {@code colExpression < 0} or {@code colCanonical < 0}
179164 */
180165 public static void loadExpressions (
181166 final MweLexicon lexicon ,
182- final Path file
183- )
167+ final WordTokenizer tokenizer ,
168+ final CSVReader csv ,
169+ final int colExpression ,
170+ final int colCanonical ,
171+ final CsvHeader csvHeader )
184172 {
185- Objects .requireNonNull (file , "file" );
186- try (CSVReader csv = new CSVReader (file , ',' , 2 )) {
187- loadExpressions (lexicon , csv , 0 , 1 , CsvHeader .SKIP );
188- } catch (IOException e ) {
189- throw new UncheckedIOException (e );
190- }
173+ Objects .requireNonNull (lexicon , "lexicon" );
174+ Objects .requireNonNull (csv , "csv" );
175+ checkColumnIndex (colExpression , "colExpression" );
176+ checkColumnIndex (colCanonical , "colCanonical" );
177+
178+ final CsvRowHandler handler = new CsvRowHandler ()
179+ {
180+ @ Override
181+ protected boolean accept (final CSVReader row ) throws UncheckedIOException
182+ {
183+ final StringBuilder expression = row .getCell (colExpression );
184+ if (expression .length () == 0 ) return false ;
185+ final StringBuilder canonical = row .getCell (colCanonical );
186+ List <String > words = tokenizer .tokenize (expression );
187+
188+ if (canonical .length () > 0 ) {
189+ lexicon .addExpression (words , canonical );
190+ } else {
191+ lexicon .addExpression (words , expression );
192+ }
193+ return true ;
194+ }
195+ };
196+
197+ forEachDataRow (csv , csvHeader , handler );
191198 }
192-
193199
194-
200+
195201 /**
196202 * Load a 2-column CSV reader into a {@link CharArrayMap}.
197203 * <p>
0 commit comments