|
1 | 1 | package com.github.oeuvres.alix.lucene.analysis; |
2 | 2 |
|
| 3 | +import java.io.IOException; |
| 4 | + |
| 5 | +import org.apache.lucene.analysis.Analyzer; |
| 6 | +import org.apache.lucene.analysis.TokenStream; |
| 7 | +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| 8 | + |
3 | 9 | import com.github.oeuvres.alix.util.CharsDic; |
4 | 10 | import com.github.oeuvres.alix.util.IntAutomaton; |
5 | 11 |
|
6 | 12 | /** |
7 | | - * Immutable, compiled lexicon of contiguous Multi-Word Expressions (MWEs) for {@link MweFilter}. |
| 13 | + * Incremental lexicon of multi-word expressions (MWEs), backed by a single |
| 14 | + * {@link CharsDic} and an {@link IntAutomaton}. |
8 | 15 | * |
9 | | - * <h2>Concept</h2> |
10 | | - * <ul> |
11 | | - * <li>Matching is performed over integer token identifiers (typically lemma ids) using an {@link IntAutomaton}.</li> |
12 | | - * <li>Accepting automaton states yield an {@code entryId} (0-based), which indexes per-entry metadata: |
13 | | - * output form (chars), POS tag id, optional lemma id.</li> |
14 | | - * <li>Output strings are stored in a {@link CharsDic} and copied into a caller-provided scratch buffer |
15 | | - * to keep runtime allocation-free.</li> |
16 | | - * </ul> |
| 16 | + * <p>The {@link CharsDic} holds both component tokens (automaton arc labels, by ordinal) |
| 17 | + * and canonical MWE forms (automaton accept values, also ordinals). A caller that receives |
| 18 | + * an accept ordinal can retrieve the canonical char data from {@link #vocab()} without |
| 19 | + * allocation.</p> |
17 | 20 | * |
18 | | - * <h2>Thread-safety</h2> |
19 | | - * <p>This object is immutable and safe to share across analyzers/threads once constructed.</p> |
| 21 | + * <p>Lifecycle:</p> |
| 22 | + * <ol> |
| 23 | + * <li>Construct with the {@link Analyzer} that matches the index-time pipeline.</li> |
| 24 | + * <li>Call {@link #addExpression(CharSequence)} for each canonical MWE string.</li> |
| 25 | + * <li>Call {@link #freeze()} once; further {@link #addExpression} calls throw.</li> |
| 26 | + * <li>Use {@link #step(int, char[], int)} and {@link #accept(int)} in the token filter.</li> |
| 27 | + * </ol> |
20 | 28 | * |
21 | | - * <h2>Contract used by {@link MweFilter}</h2> |
22 | | - * <ul> |
23 | | - * <li>{@link #root()}, {@link #step(int, int)}, {@link #acceptEntry(int)} for traversal</li> |
24 | | - * <li>{@link #maxPatternTokens()} for bounded lookahead</li> |
25 | | - * <li>{@link #copyOutput(int, char[])} and {@link #maxOutputLen()} for term emission</li> |
26 | | - * <li>{@link #pos(int)} and {@link #lemmaId(int)} for compound token attributes</li> |
27 | | - * </ul> |
| 29 | + * <p>Thread-safety: not thread-safe during building; immutable and safe for concurrent |
| 30 | + * read after {@link #freeze()}.</p> |
28 | 31 | */ |
29 | 32 | public final class MweLexicon |
30 | 33 | { |
31 | | - private final IntAutomaton automaton; |
32 | | - |
33 | | - /** Pool of output terms (canonical surface or canonical lemma form, depending on loader policy). */ |
34 | | - private final CharsDic outputDic; |
35 | | - |
36 | | - /** entryId -> output ordinal in {@link #outputDic}. */ |
37 | | - private final int[] outputOrd; |
| 34 | + /** Shared vocabulary: component tokens and canonical forms, identified by ordinal. */ |
| 35 | + private final CharsDic vocab; |
38 | 36 |
|
39 | | - /** entryId -> POS tag id (tagset is caller-defined). */ |
40 | | - private final short[] pos; |
| 37 | + /** Automaton over component-token ordinal sequences; accept value = canonical form ordinal. */ |
| 38 | + private final IntAutomaton auto; |
41 | 39 |
|
42 | | - /** entryId -> lemma id for the compound token, or -1 if not defined. */ |
43 | | - private final int[] lemmaId; |
| 40 | + /** Analyzer used to split canonical forms into component tokens. */ |
| 41 | + private final Analyzer analyzer; |
44 | 42 |
|
45 | | - /** Max number of tokens in any pattern (lookahead bound). */ |
46 | | - private final int maxPatternTokens; |
| 43 | + /** Field name passed to the analyzer (may be a dummy value). */ |
| 44 | + private final String fieldName; |
47 | 45 |
|
48 | | - /** Max output length in UTF-16 code units among all entries (for scratch sizing). */ |
49 | | - private final int maxOutputLen; |
| 46 | + /** Reusable buffer for token-id sequences during addExpression. */ |
| 47 | + private int[] idsBuf; |
50 | 48 |
|
51 | 49 | /** |
52 | | - * Build a lexicon from already-compiled components. |
| 50 | + * Constructs an empty, mutable lexicon. |
53 | 51 | * |
54 | | - * <p>All arrays are defensively checked for length consistency; they are not copied. |
55 | | - * Treat passed arrays as frozen after construction.</p> |
56 | | - * |
57 | | - * @param automaton packed automaton; its accept ids must be entryIds in [0..entryCount) or -1 |
58 | | - * @param outputDic output term pool |
59 | | - * @param outputOrd entryId -> output ordinal |
60 | | - * @param pos entryId -> POS tag id |
61 | | - * @param lemmaId entryId -> lemma id or -1 |
62 | | - * @throws NullPointerException if any argument is null |
63 | | - * @throws IllegalArgumentException if array lengths disagree |
64 | | - */ |
65 | | - public MweLexicon( |
66 | | - final IntAutomaton automaton, |
67 | | - final CharsDic outputDic, |
68 | | - final int[] outputOrd, |
69 | | - final short[] pos, |
70 | | - final int[] lemmaId |
71 | | - ) { |
72 | | - if (automaton == null) throw new NullPointerException("automaton"); |
73 | | - if (outputDic == null) throw new NullPointerException("outputDic"); |
74 | | - if (outputOrd == null) throw new NullPointerException("outputOrd"); |
75 | | - if (pos == null) throw new NullPointerException("pos"); |
76 | | - if (lemmaId == null) throw new NullPointerException("lemmaId"); |
77 | | - |
78 | | - final int n = outputOrd.length; |
79 | | - if (pos.length != n) throw new IllegalArgumentException("pos.length != outputOrd.length"); |
80 | | - if (lemmaId.length != n) throw new IllegalArgumentException("lemmaId.length != outputOrd.length"); |
81 | | - |
82 | | - this.automaton = automaton; |
83 | | - this.outputDic = outputDic; |
84 | | - this.outputOrd = outputOrd; |
85 | | - this.pos = pos; |
86 | | - this.lemmaId = lemmaId; |
87 | | - |
88 | | - this.maxPatternTokens = Math.max(1, automaton.maxLen()); |
89 | | - |
90 | | - // Prefer O(1) if you implement it in CharsDic; otherwise compute once here. |
91 | | - int mol = 0; |
92 | | - // If your CharsDic exposes maxTermLength(), use it: |
93 | | - // mol = outputDic.maxTermLength(); |
94 | | - // Otherwise, compute from entry ords: |
95 | | - for (int i = 0; i < n; i++) { |
96 | | - final int ord = outputOrd[i]; |
97 | | - final int len = outputDic.termLength(ord); |
98 | | - if (len > mol) mol = len; |
99 | | - } |
100 | | - this.maxOutputLen = mol; |
101 | | - |
102 | | - // Optional: validate accept ids are within bounds (cheap enough for debug builds). |
103 | | - // validateAcceptIds(n); |
104 | | - } |
105 | | - |
106 | | - /** |
107 | | - * Root automaton state (always 0). |
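| 52 | + * @param analyzer analyzer whose tokenization matches the index-time pipeline; must not be {@code null} (an {@link IllegalArgumentException} is thrown otherwise) |
| 53 | + * @param fieldName field name passed to the analyzer; must not be {@code null} (an {@link IllegalArgumentException} is thrown otherwise) |
| 54 | + * @param expectedSize estimate of the number of MWEs; used only for initial sizing |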
108 | 55 | */ |
109 | | - public int root() |
| 56 | + public MweLexicon(final Analyzer analyzer, final String fieldName, final int expectedSize) |
110 | 57 | { |
111 | | - return automaton.root(); |
| 58 | + if (analyzer == null) throw new IllegalArgumentException("analyzer"); |
| 59 | + if (fieldName == null) throw new IllegalArgumentException("fieldName"); |
| 60 | + this.analyzer = analyzer; |
| 61 | + this.fieldName = fieldName; |
| 62 | + this.vocab = new CharsDic(Math.max(8, expectedSize * 3)); |
| 63 | + this.auto = new IntAutomaton(); |
| 64 | + this.idsBuf = new int[8]; |
112 | 65 | } |
113 | 66 |
|
114 | 67 | /** |
115 | | - * Transition function: follow an arc labeled {@code tokenId} from {@code state}. |
116 | | - * |
117 | | - * @return next state id, or -1 if no transition exists |
| 68 | + * Returns the {@link CharsDic} ordinal of the canonical form if {@code state} is |
| 69 | + * accepting, or -1 if non-accepting. |
| 70 | + * Pass the result to {@link #vocab()} to retrieve char data without allocation. |
118 | 71 | */ |
119 | | - public int step(final int state, final int tokenId) |
| 72 | + public int accept(final int state) |
120 | 73 | { |
121 | | - return automaton.step(state, tokenId); |
| 74 | + return auto.accept(state); |
122 | 75 | } |
123 | 76 |
|
124 | 77 | /** |
125 | | - * Returns an {@code entryId} if {@code state} is accepting, or -1 otherwise. |
| 78 | + * Tokenizes {@code expression} with the analyzer, registers each component token |
| 79 | + * in the vocabulary, and adds the token-id sequence to the automaton with the |
| 80 | + * canonical form ordinal as accept value. |
126 | 81 | * |
127 | | - * <p>The returned value indexes {@link #pos(int)}, {@link #lemmaId(int)}, and {@link #copyOutput(int, char[])}.</p> |
128 | | - */ |
129 | | - public int acceptEntry(final int state) |
130 | | - { |
131 | | - return automaton.accept(state); |
132 | | - } |
133 | | - |
134 | | - /** |
135 | | - * Maximum number of tokens in any MWE pattern. |
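| 82 | + * <p>Expressions that yield fewer than two tokens are silently left out of the automaton (no exception is thrown); a token they did produce is nevertheless already registered in the vocabulary. |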
| 83 | + * If the same token sequence is added more than once, the last canonical form wins.</p> |
136 | 84 | * |
137 | | - * <p>This is the correct lookahead bound for longest-match algorithms in a linear TokenStream.</p> |
| 85 | + * @param expression canonical MWE string (e.g. {@code "machine learning"}) |
| 86 | + * @throws IOException if the analyzer throws during tokenization |
| 87 | + * @throws IllegalStateException if {@link #freeze()} has already been called |
138 | 88 | */ |
139 | | - public int maxPatternTokens() |
| 89 | + public void addExpression(final CharSequence expression) throws IOException |
140 | 90 | { |
141 | | - return maxPatternTokens; |
142 | | - } |
| 91 | + if (idsBuf == null) throw new IllegalStateException("frozen"); |
| 92 | + if (expression == null || expression.length() == 0) return; |
| 93 | + |
| 94 | + // Tokenize first. Component tokens are added to vocab as they are read; only the canonical form is deferred until the sequence is known to be a valid MWE (>= 2 tokens). |
| 95 | + int len = 0; |
| 96 | + try (TokenStream ts = analyzer.tokenStream(fieldName, expression.toString())) { |
| 97 | + final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); |
| 98 | + ts.reset(); |
| 99 | + while (ts.incrementToken()) { |
| 100 | + // CharTermAttribute implements CharSequence: no toString() copy needed. |
| 101 | + final int tokOrd = ord(vocab.add(termAtt)); |
| 102 | + if (len == idsBuf.length) idsBuf = java.util.Arrays.copyOf(idsBuf, len * 2); |
| 103 | + idsBuf[len++] = tokOrd; |
| 104 | + } |
| 105 | + ts.end(); |
| 106 | + } |
143 | 107 |
|
144 | | - /** |
145 | | - * Backward-compatible alias for {@link #maxPatternTokens()}. |
146 | | - * |
147 | | - * <p>Avoid exposing "len" without unit; prefer {@link #maxPatternTokens()} in new code.</p> |
148 | | - */ |
149 | | - public int maxLen() |
150 | | - { |
151 | | - return maxPatternTokens; |
152 | | - } |
| 108 | + if (len < 2) return; |
153 | 109 |
|
154 | | - /** |
155 | | - * Number of MWE entries in this lexicon. |
156 | | - */ |
157 | | - public int size() |
158 | | - { |
159 | | - return outputOrd.length; |
| 110 | + // Register canonical form only after confirming the expression is valid. |
| 111 | + final int formOrd = ord(vocab.add(expression)); |
| 112 | + auto.add(idsBuf, len, formOrd); |
160 | 113 | } |
161 | 114 |
|
162 | 115 | /** |
163 | | - * POS tag id for the compound token produced by this entry. |
164 | | - * |
165 | | - * @param entryId entry id returned by {@link #acceptEntry(int)} |
| 116 | + * Freezes the vocabulary and packs the automaton into primitive arrays. |
| 117 | + * Must be called before any runtime method. Idempotent. |
166 | 118 | */ |
167 | | - public short pos(final int entryId) |
| 119 | + public void freeze() |
168 | 120 | { |
169 | | - checkEntry(entryId); |
170 | | - return pos[entryId]; |
| 121 | + vocab.trimToSize(); |
| 122 | + auto.freeze(false); |
| 123 | + idsBuf = null; |
171 | 124 | } |
172 | 125 |
|
173 | 126 | /** |
174 | | - * Optional lemma id for the compound token produced by this entry. |
175 | | - * |
176 | | - * @param entryId entry id returned by {@link #acceptEntry(int)} |
177 | | - * @return lemma id, or -1 if none is defined |
| 127 | + * Upper bound on MWE length in tokens. |
| 128 | + * Use to size the token filter's lookahead buffer ({@code maxLen() + 1}). |
178 | 129 | */ |
179 | | - public int lemmaId(final int entryId) |
| 130 | + public int maxLen() |
180 | 131 | { |
181 | | - checkEntry(entryId); |
182 | | - return lemmaId[entryId]; |
| 132 | + return auto.maxLen(); |
183 | 133 | } |
184 | 134 |
|
185 | | - /** |
186 | | - * Copies the output term for {@code entryId} into {@code dst[0..len)} and returns {@code len}. |
187 | | - * |
188 | | - * <p>This is the hot-path method used by {@link MweFilter} to emit the canonical MWE term text |
189 | | - * without allocating.</p> |
190 | | - * |
191 | | - * <p>Caller responsibility: ensure {@code dst.length >= maxOutputLen()}.</p> |
192 | | - * |
193 | | - * @param entryId entry id returned by {@link #acceptEntry(int)} |
194 | | - * @param dst destination buffer (start at index 0) |
195 | | - * @return length in UTF-16 code units |
196 | | - * @throws IllegalArgumentException if {@code entryId} is invalid or {@code dst} too small |
197 | | - * @throws NullPointerException if {@code dst} is null |
198 | | - */ |
199 | | - public int copyOutput(final int entryId, final char[] dst) |
| 135 | + /** Root state; pass as the initial state to the first {@link #step} call per position. */ |
| 136 | + public int root() |
200 | 137 | { |
201 | | - checkEntry(entryId); |
202 | | - if (dst == null) throw new NullPointerException("dst"); |
203 | | - |
204 | | - final int ord = outputOrd[entryId]; |
205 | | - |
206 | | - // Preferred if you add the safe copy-out API to CharsDic: |
207 | | - // return outputDic.get(ord, dst); |
208 | | - |
209 | | - // Fallback using current CharsDic API (slab + offsets) while keeping it encapsulated here: |
210 | | - final int len = outputDic.termLength(ord); |
211 | | - if (dst.length < len) { |
212 | | - throw new IllegalArgumentException("dst too small: dst.length=" + dst.length + " need=" + len); |
213 | | - } |
214 | | - final int off = outputDic.termOffset(ord); |
215 | | - System.arraycopy(outputDic.slab(), off, dst, 0, len); |
216 | | - return len; |
| 138 | + return auto.root(); |
217 | 139 | } |
218 | 140 |
|
219 | 141 | /** |
220 | | - * Maximum output term length among all entries, in UTF-16 code units. |
| 142 | + * Advances the automaton by one token. |
221 | 143 | * |
222 | | - * <p>Use this value to allocate a reusable scratch buffer for {@link #copyOutput(int, char[])}.</p> |
223 | | - */ |
224 | | - public int maxOutputLen() |
225 | | - { |
226 | | - return maxOutputLen; |
227 | | - } |
228 | | - |
229 | | - /** |
230 | | - * Debug helper: returns the output term as a String (allocates). |
| 144 | + * <p>Tokens absent from the vocabulary return -1 immediately without touching |
| 145 | + * the automaton — fast-fail for the common case.</p> |
231 | 146 | * |
232 | | - * @param entryId entry id returned by {@link #acceptEntry(int)} |
| 147 | + * @param state current automaton state |
| 148 | + * @param buf token character buffer (e.g. {@link CharTermAttribute#buffer()}) |
| 149 | + * @param len number of valid chars in {@code buf} |
| 150 | + * @return next state, or -1 if no transition exists |
233 | 151 | */ |
234 | | - public String outputAsString(final int entryId) |
| 152 | + public int step(final int state, final char[] buf, final int len) |
235 | 153 | { |
236 | | - checkEntry(entryId); |
237 | | - final int ord = outputOrd[entryId]; |
238 | | - final int off = outputDic.termOffset(ord); |
239 | | - final int len = outputDic.termLength(ord); |
240 | | - return new String(outputDic.slab(), off, len); |
| 154 | + if (idsBuf != null) throw new IllegalStateException("not frozen"); |
| 155 | + final int tokOrd = vocab.find(buf, 0, len); |
| 156 | + if (tokOrd < 0) return -1; |
| 157 | + return auto.step(state, tokOrd); |
241 | 158 | } |
242 | 159 |
|
243 | 160 | /** |
244 | | - * Exposes the internal automaton (advanced use). |
| 161 | + * Shared vocabulary; use ordinals from {@link #accept(int)} to retrieve canonical forms. |
245 | 162 | */ |
246 | | - public IntAutomaton automaton() |
| 163 | + public CharsDic vocab() |
247 | 164 | { |
248 | | - return automaton; |
249 | | - } |
250 | | - |
251 | | - /** |
252 | | - * Exposes the output dictionary (advanced use; immutable by convention). |
253 | | - */ |
254 | | - public CharsDic outputDic() |
255 | | - { |
256 | | - return outputDic; |
257 | | - } |
258 | | - |
259 | | - private void checkEntry(final int entryId) |
260 | | - { |
261 | | - if (entryId < 0 || entryId >= outputOrd.length) { |
262 | | - throw new IllegalArgumentException("bad entryId " + entryId); |
263 | | - } |
| 165 | + return vocab; |
264 | 166 | } |
265 | 167 |
|
266 | | - @SuppressWarnings("unused") |
267 | | - private void validateAcceptIds(final int entryCount) |
| 168 | + /** Recovers a {@link CharsDic} ordinal from the raw return value of {@code add()}: a non-negative value is returned unchanged, a negative value is decoded as {@code -raw - 1}. */ |
| 169 | + private static int ord(final int raw) |
268 | 170 | { |
269 | | - // IntAutomaton stores accept per state; we can only validate by scanning all states if you expose state count. |
270 | | - // If you later add IntAutomaton.stateCount(), validate accept ids here. |
| 171 | + return raw >= 0 ? raw : -raw - 1; |
271 | 172 | } |
272 | 173 | } |
0 commit comments