Skip to content

Commit 32436db

Browse files
committed
MweLexicon ready for load
1 parent 3d4db0d commit 32436db

7 files changed

Lines changed: 319 additions & 279 deletions

File tree

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/LexiconHelper.java

Lines changed: 156 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,158 @@ private abstract static class CsvRowHandler
9797
protected abstract boolean accept(CSVReader csv) throws UncheckedIOException;
9898
}
9999

100+
/**
101+
* Load a CSV reader into a {@link MweLexicon}.
102+
*
103+
* @param lexicon
104+
* @param csv CSV reader
105+
* @param col column index containing the form to add
106+
* @param skipHeader if {@code true}, the first row is skipped
107+
* @throws UncheckedIOException on read error
108+
* @throws NullPointerException if {@code lexicon} or {@code csv} is null
109+
* @throws IllegalArgumentException if {@code col < 0}
110+
*/
111+
public static void loadExpressions(
112+
final MweLexicon lexicon,
113+
final CSVReader csv,
114+
final int col,
115+
final CsvHeader csvHeader)
116+
{
117+
Objects.requireNonNull(lexicon, "lexicon");
118+
Objects.requireNonNull(csv, "csv");
119+
checkColumnIndex(col, "col");
120+
121+
final int minCols = col + 1;
122+
final int keyCol = col;
123+
final CsvRowHandler handler = new CsvRowHandler()
124+
{
125+
@Override
126+
protected boolean accept(final CSVReader row) throws UncheckedIOException
127+
{
128+
final StringBuilder form = row.getCell(col);
129+
if (form.length() == 0) return false;
130+
lexicon.addExpression(form);
131+
return true;
132+
}
133+
};
134+
135+
forEachDataRow(csv, minCols, keyCol, csvHeader, handler);
136+
}
137+
138+
139+
/**
140+
* Load one column of expressions
141+
*
142+
* @param lexicon
143+
* @param anchor class used to resolve the resource path
144+
* @param resourcePath classpath resource path
145+
* @throws UncheckedIOException on read error
146+
* @throws NullPointerException if {@code lexicon}, {@code anchor}, or
147+
* {@code resourcePath} is null
148+
*/
149+
public static void loadExpressions(
150+
final MweLexicon lexicon,
151+
final Class<?> anchor,
152+
final String resourcePath
153+
)
154+
{
155+
Objects.requireNonNull(anchor, "anchor");
156+
Objects.requireNonNull(resourcePath, "resourcePath");
157+
try (CSVReader csv = new CSVReader(anchor, resourcePath, ',', 1)) {
158+
loadExpressions(lexicon, csv, 0, CsvHeader.SKIP);
159+
} catch (IOException e) {
160+
throw new UncheckedIOException(e);
161+
}
162+
}
163+
164+
/**
165+
* Load one column of expressions
166+
*
167+
* @param lexicon
168+
* @param anchor class used to resolve the resource path
169+
* @param resourcePath classpath resource path
170+
* @throws UncheckedIOException on read error
171+
* @throws NullPointerException if {@code lexicon}, {@code anchor}, or
172+
* {@code resourcePath} is null
173+
*/
174+
public static void loadExpressions(
175+
final MweLexicon lexicon,
176+
final Path file
177+
)
178+
{
179+
Objects.requireNonNull(file, "file");
180+
try (CSVReader csv = new CSVReader(file, ',', 1)) {
181+
loadExpressions(lexicon, csv, 0, CsvHeader.SKIP);
182+
} catch (IOException e) {
183+
throw new UncheckedIOException(e);
184+
}
185+
}
186+
187+
188+
189+
/**
190+
* Load a 2-column CSV reader into a {@link CharArrayMap}.
191+
* <p>
192+
* Column 0 = key, column 1 = value.
193+
* </p>
194+
*
195+
* @param map target map (key -&gt; char[] value)
196+
* @param csv CSV reader
197+
* @param replace if {@code true}, overwrite existing keys; otherwise
198+
* keep existing entries
199+
* @param skipHeader if {@code true}, the first row is skipped
200+
* @throws UncheckedIOException on read error
201+
* @throws NullPointerException if {@code map} or {@code csv} is null
202+
*/
203+
public static void loadMap(
204+
final CharArrayMap<char[]> map,
205+
final CSVReader csv,
206+
final OnDuplicate policy,
207+
final CsvHeader csvHeader,
208+
final int keyCol,
209+
final int valueCol,
210+
Report report
211+
)
212+
throws UncheckedIOException
213+
{
214+
Objects.requireNonNull(map, "map");
215+
Objects.requireNonNull(csv, "csv");
216+
217+
final int minCols = Math.max(keyCol, valueCol) + 1;
218+
219+
final CsvRowHandler handler = new CsvRowHandler()
220+
{
221+
@Override
222+
protected boolean accept(final CSVReader row) throws UncheckedIOException
223+
{
224+
final StringBuilder key = row.getCell(keyCol);
225+
if (map.containsKey(key)) {
226+
if (policy == OnDuplicate.IGNORE) {
227+
return false;
228+
}
229+
if (policy == OnDuplicate.REPLACE) {
230+
map.put(key, row.getCellToCharArray(valueCol));
231+
return true;
232+
}
233+
String msg = "LexiconHelper.loadMap " + row.getSpec() + "#l" + row.getRowNo()
234+
+ " duplicate key=" + key + " oldValue=" + new String(map.get(key)) + " newValue=" + row.getCell(valueCol);
235+
if (policy == OnDuplicate.ERROR) {
236+
throw new RuntimeException(msg);
237+
}
238+
if (report != null && policy == OnDuplicate.REPORT) {
239+
report.warn(msg);
240+
return false;
241+
}
242+
}
243+
map.put(key, row.getCellToCharArray(valueCol));
244+
return true;
245+
}
246+
};
247+
248+
forEachDataRow(csv, minCols, keyCol, csvHeader, handler);
249+
}
250+
251+
100252
/**
101253
* Load a 2-column CSV resource into a {@link CharArrayMap} from a classpath
102254
* resource.
@@ -188,68 +340,6 @@ public static void loadMap(
188340
}
189341

190342

191-
/**
192-
* Load a 2-column CSV reader into a {@link CharArrayMap}.
193-
* <p>
194-
* Column 0 = key, column 1 = value.
195-
* </p>
196-
*
197-
* @param map target map (key -&gt; char[] value)
198-
* @param csv CSV reader
199-
* @param replace if {@code true}, overwrite existing keys; otherwise
200-
* keep existing entries
201-
* @param skipHeader if {@code true}, the first row is skipped
202-
* @throws UncheckedIOException on read error
203-
* @throws NullPointerException if {@code map} or {@code csv} is null
204-
*/
205-
public static void loadMap(
206-
final CharArrayMap<char[]> map,
207-
final CSVReader csv,
208-
final OnDuplicate policy,
209-
final CsvHeader csvHeader,
210-
final int keyCol,
211-
final int valueCol,
212-
Report report
213-
)
214-
throws UncheckedIOException
215-
{
216-
Objects.requireNonNull(map, "map");
217-
Objects.requireNonNull(csv, "csv");
218-
219-
final int minCols = Math.max(keyCol, valueCol) + 1;
220-
221-
final CsvRowHandler handler = new CsvRowHandler()
222-
{
223-
@Override
224-
protected boolean accept(final CSVReader row) throws UncheckedIOException
225-
{
226-
final StringBuilder key = row.getCell(keyCol);
227-
if (map.containsKey(key)) {
228-
if (policy == OnDuplicate.IGNORE) {
229-
return false;
230-
}
231-
if (policy == OnDuplicate.REPLACE) {
232-
map.put(key, row.getCellToCharArray(valueCol));
233-
return true;
234-
}
235-
String msg = "LexiconHelper.loadMap " + row.getSpec() + "#l" + row.getRowNo()
236-
+ " duplicate key=" + key + " oldValue=" + new String(map.get(key)) + " newValue=" + row.getCell(valueCol);
237-
if (policy == OnDuplicate.ERROR) {
238-
throw new RuntimeException(msg);
239-
}
240-
if (report != null && policy == OnDuplicate.REPORT) {
241-
report.warn(msg);
242-
return false;
243-
}
244-
}
245-
map.put(key, row.getCellToCharArray(valueCol));
246-
return true;
247-
}
248-
};
249-
250-
forEachDataRow(csv, minCols, keyCol, csvHeader, handler);
251-
}
252-
253343
/**
254344
* Load a CSV resource into a {@link CharArraySet} from a classpath
255345
* resource.
@@ -270,7 +360,6 @@ public static void loadSet(
270360
final CharArraySet set,
271361
final Class<?> anchor,
272362
final String resourcePath)
273-
throws UncheckedIOException
274363
{
275364
loadSet(set, anchor, resourcePath, 0, CsvHeader.SKIP, null);
276365
}
@@ -298,7 +387,6 @@ public static void loadSet(
298387
final int col,
299388
final CsvHeader csvHeader,
300389
final String rtrimChars)
301-
throws UncheckedIOException
302390
{
303391
Objects.requireNonNull(anchor, "anchor");
304392
Objects.requireNonNull(resourcePath, "resourcePath");
@@ -606,10 +694,10 @@ private static void forEachDataRow(
606694
if (csv.getCellCount() < minCols) {
607695
continue;
608696
}
609-
610-
final StringBuilder key = csv.getCell(keyCol);
611-
if (isBlankOrComment(key))
612-
continue;
697+
// skip comment rows: first cell starts with '#'
698+
if (csv.getCell(0).length() > 0 && csv.getCell(0).charAt(0) == '#') continue;
699+
// skip rows whose key cell is empty
700+
if(csv.getCell(keyCol).length() == 0) continue;
613701

614702
handler.accept(csv);
615703
}
@@ -618,17 +706,6 @@ private static void forEachDataRow(
618706
}
619707
}
620708

621-
/**
622-
* Return whether a cell is blank or starts with {@code '#'} (comment
623-
* marker).
624-
*
625-
* @param cell CSV cell content
626-
* @return {@code true} if null, empty, or comment
627-
*/
628-
private static boolean isBlankOrComment(final CharSequence cell)
629-
{
630-
return cell == null || cell.length() == 0 || cell.charAt(0) == '#';
631-
}
632709

633710
/**
634711
* Compute the minimum required column count from zero-based column indices.

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/MweFilter.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,7 @@ private void emitMerged(final int matchPos, final int matchOrd)
181181
// Restore all attributes from first component (posIncr, startOffset, flags, ...).
182182
queue.restoreTo(this, 0);
183183

184-
// Override term with canonical form — direct char copy, no String allocation.
185-
lexicon.fillTerm(matchOrd, termAtt);
184+
lexicon.formToAttribute(matchOrd, termAtt);
186185

187186
// Fix endOffset and type.
188187
offsetAtt.setOffset(offsetAtt.startOffset(), endOffset);

0 commit comments

Comments
 (0)