Skip to content

Commit 35f070d

Browse files
committed
Seems to work
1 parent 27d5b9d commit 35f070d

4 files changed

Lines changed: 382 additions & 260 deletions

File tree

Lines changed: 108 additions & 207 deletions
Original file line number | Diff line number | Diff line change
@@ -1,272 +1,173 @@
11
package com.github.oeuvres.alix.lucene.analysis;
22

3+
import java.io.IOException;
4+
5+
import org.apache.lucene.analysis.Analyzer;
6+
import org.apache.lucene.analysis.TokenStream;
7+
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
8+
39
import com.github.oeuvres.alix.util.CharsDic;
410
import com.github.oeuvres.alix.util.IntAutomaton;
511

612
/**
7-
* Immutable, compiled lexicon of contiguous Multi-Word Expressions (MWEs) for {@link MweFilter}.
13+
* Incremental lexicon of multi-word expressions (MWEs), backed by a single
14+
* {@link CharsDic} and an {@link IntAutomaton}.
815
*
9-
* <h2>Concept</h2>
10-
* <ul>
11-
* <li>Matching is performed over integer token identifiers (typically lemma ids) using an {@link IntAutomaton}.</li>
12-
* <li>Accepting automaton states yield an {@code entryId} (0-based), which indexes per-entry metadata:
13-
* output form (chars), POS tag id, optional lemma id.</li>
14-
* <li>Output strings are stored in a {@link CharsDic} and copied into a caller-provided scratch buffer
15-
* to keep runtime allocation-free.</li>
16-
* </ul>
16+
* <p>The {@link CharsDic} holds both component tokens (automaton arc labels, by ordinal)
17+
* and canonical MWE forms (automaton accept values, also ordinals). A caller that receives
18+
* an accept ordinal can retrieve the canonical char data from {@link #vocab()} without
19+
* allocation.</p>
1720
*
18-
* <h2>Thread-safety</h2>
19-
* <p>This object is immutable and safe to share across analyzers/threads once constructed.</p>
21+
* <p>Lifecycle:</p>
22+
* <ol>
23+
* <li>Construct with the {@link Analyzer} that matches the index-time pipeline.</li>
24+
* <li>Call {@link #addExpression(CharSequence)} for each canonical MWE string.</li>
25+
* <li>Call {@link #freeze()} once; further {@link #addExpression} calls throw.</li>
26+
* <li>Use {@link #step(int, char[], int)} and {@link #accept(int)} in the token filter.</li>
27+
* </ol>
2028
*
21-
* <h2>Contract used by {@link MweFilter}</h2>
22-
* <ul>
23-
* <li>{@link #root()}, {@link #step(int, int)}, {@link #acceptEntry(int)} for traversal</li>
24-
* <li>{@link #maxPatternTokens()} for bounded lookahead</li>
25-
* <li>{@link #copyOutput(int, char[])} and {@link #maxOutputLen()} for term emission</li>
26-
* <li>{@link #pos(int)} and {@link #lemmaId(int)} for compound token attributes</li>
27-
* </ul>
29+
* <p>Thread-safety: not thread-safe during building; immutable and safe for concurrent
30+
* read after {@link #freeze()}.</p>
2831
*/
2932
public final class MweLexicon
3033
{
31-
private final IntAutomaton automaton;
32-
33-
/** Pool of output terms (canonical surface or canonical lemma form, depending on loader policy). */
34-
private final CharsDic outputDic;
35-
36-
/** entryId -> output ordinal in {@link #outputDic}. */
37-
private final int[] outputOrd;
34+
/** Shared vocabulary: component tokens and canonical forms, identified by ordinal. */
35+
private final CharsDic vocab;
3836

39-
/** entryId -> POS tag id (tagset is caller-defined). */
40-
private final short[] pos;
37+
/** Automaton over component-token ordinal sequences; accept value = canonical form ordinal. */
38+
private final IntAutomaton auto;
4139

42-
/** entryId -> lemma id for the compound token, or -1 if not defined. */
43-
private final int[] lemmaId;
40+
/** Analyzer used to split canonical forms into component tokens. */
41+
private final Analyzer analyzer;
4442

45-
/** Max number of tokens in any pattern (lookahead bound). */
46-
private final int maxPatternTokens;
43+
/** Field name passed to the analyzer (may be a dummy value). */
44+
private final String fieldName;
4745

48-
/** Max output length in UTF-16 code units among all entries (for scratch sizing). */
49-
private final int maxOutputLen;
46+
/** Reusable buffer for token-id sequences during addExpression. */
47+
private int[] idsBuf;
5048

5149
/**
52-
* Build a lexicon from already-compiled components.
50+
* Constructs an empty, mutable lexicon.
5351
*
54-
* <p>All arrays are defensively checked for length consistency; they are not copied.
55-
* Treat passed arrays as frozen after construction.</p>
56-
*
57-
* @param automaton packed automaton; its accept ids must be entryIds in [0..entryCount) or -1
58-
* @param outputDic output term pool
59-
* @param outputOrd entryId -> output ordinal
60-
* @param pos entryId -> POS tag id
61-
* @param lemmaId entryId -> lemma id or -1
62-
* @throws NullPointerException if any argument is null
63-
* @throws IllegalArgumentException if array lengths disagree
64-
*/
65-
public MweLexicon(
66-
final IntAutomaton automaton,
67-
final CharsDic outputDic,
68-
final int[] outputOrd,
69-
final short[] pos,
70-
final int[] lemmaId
71-
) {
72-
if (automaton == null) throw new NullPointerException("automaton");
73-
if (outputDic == null) throw new NullPointerException("outputDic");
74-
if (outputOrd == null) throw new NullPointerException("outputOrd");
75-
if (pos == null) throw new NullPointerException("pos");
76-
if (lemmaId == null) throw new NullPointerException("lemmaId");
77-
78-
final int n = outputOrd.length;
79-
if (pos.length != n) throw new IllegalArgumentException("pos.length != outputOrd.length");
80-
if (lemmaId.length != n) throw new IllegalArgumentException("lemmaId.length != outputOrd.length");
81-
82-
this.automaton = automaton;
83-
this.outputDic = outputDic;
84-
this.outputOrd = outputOrd;
85-
this.pos = pos;
86-
this.lemmaId = lemmaId;
87-
88-
this.maxPatternTokens = Math.max(1, automaton.maxLen());
89-
90-
// Prefer O(1) if you implement it in CharsDic; otherwise compute once here.
91-
int mol = 0;
92-
// If your CharsDic exposes maxTermLength(), use it:
93-
// mol = outputDic.maxTermLength();
94-
// Otherwise, compute from entry ords:
95-
for (int i = 0; i < n; i++) {
96-
final int ord = outputOrd[i];
97-
final int len = outputDic.termLength(ord);
98-
if (len > mol) mol = len;
99-
}
100-
this.maxOutputLen = mol;
101-
102-
// Optional: validate accept ids are within bounds (cheap enough for debug builds).
103-
// validateAcceptIds(n);
104-
}
105-
106-
/**
107-
* Root automaton state (always 0).
52+
* @param analyzer analyzer whose tokenization matches the index-time pipeline; must not be {@code null}
53+
* @param fieldName field name passed to the analyzer; must not be {@code null}
54+
* @param expectedSize estimate of the number of MWEs; used only for initial sizing
10855
*/
109-
public int root()
56+
public MweLexicon(final Analyzer analyzer, final String fieldName, final int expectedSize)
11057
{
111-
return automaton.root();
58+
if (analyzer == null) throw new IllegalArgumentException("analyzer");
59+
if (fieldName == null) throw new IllegalArgumentException("fieldName");
60+
this.analyzer = analyzer;
61+
this.fieldName = fieldName;
62+
this.vocab = new CharsDic(Math.max(8, expectedSize * 3));
63+
this.auto = new IntAutomaton();
64+
this.idsBuf = new int[8];
11265
}
11366

11467
/**
115-
* Transition function: follow an arc labeled {@code tokenId} from {@code state}.
116-
*
117-
* @return next state id, or -1 if no transition exists
68+
* Returns the {@link CharsDic} ordinal of the canonical form if {@code state} is
69+
* accepting, or -1 if non-accepting.
70+
* Pass the result to {@link #vocab()} to retrieve char data without allocation.
11871
*/
119-
public int step(final int state, final int tokenId)
72+
public int accept(final int state)
12073
{
121-
return automaton.step(state, tokenId);
74+
return auto.accept(state);
12275
}
12376

12477
/**
125-
* Returns an {@code entryId} if {@code state} is accepting, or -1 otherwise.
78+
* Tokenizes {@code expression} with the analyzer, registers each component token
79+
* in the vocabulary, and adds the token-id sequence to the automaton with the
80+
* canonical form ordinal as accept value.
12681
*
127-
* <p>The returned value indexes {@link #pos(int)}, {@link #lemmaId(int)}, and {@link #copyOutput(int, char[])}.</p>
128-
*/
129-
public int acceptEntry(final int state)
130-
{
131-
return automaton.accept(state);
132-
}
133-
134-
/**
135-
* Maximum number of tokens in any MWE pattern.
82+
* <p>Null or empty expressions, and expressions that yield fewer than two tokens, are silently skipped.
83+
* If the same token sequence is added more than once, the last canonical form wins.</p>
13684
*
137-
* <p>This is the correct lookahead bound for longest-match algorithms in a linear TokenStream.</p>
85+
* @param expression canonical MWE string (e.g. {@code "machine learning"})
86+
* @throws IOException if the analyzer throws during tokenization
87+
* @throws IllegalStateException if {@link #freeze()} has already been called
13888
*/
139-
public int maxPatternTokens()
89+
public void addExpression(final CharSequence expression) throws IOException
14090
{
141-
return maxPatternTokens;
142-
}
91+
if (idsBuf == null) throw new IllegalStateException("frozen");
92+
if (expression == null || expression.length() == 0) return;
93+
94+
// Tokenize first: component tokens enter vocab as they are read, but the canonical form is registered only if the sequence is a valid MWE (>= 2 tokens).
95+
int len = 0;
96+
try (TokenStream ts = analyzer.tokenStream(fieldName, expression.toString())) {
97+
final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
98+
ts.reset();
99+
while (ts.incrementToken()) {
100+
// CharTermAttribute implements CharSequence: no toString() copy needed.
101+
final int tokOrd = ord(vocab.add(termAtt));
102+
if (len == idsBuf.length) idsBuf = java.util.Arrays.copyOf(idsBuf, len * 2);
103+
idsBuf[len++] = tokOrd;
104+
}
105+
ts.end();
106+
}
143107

144-
/**
145-
* Backward-compatible alias for {@link #maxPatternTokens()}.
146-
*
147-
* <p>Avoid exposing "len" without unit; prefer {@link #maxPatternTokens()} in new code.</p>
148-
*/
149-
public int maxLen()
150-
{
151-
return maxPatternTokens;
152-
}
108+
if (len < 2) return;
153109

154-
/**
155-
* Number of MWE entries in this lexicon.
156-
*/
157-
public int size()
158-
{
159-
return outputOrd.length;
110+
// Register canonical form only after confirming the expression is valid.
111+
final int formOrd = ord(vocab.add(expression));
112+
auto.add(idsBuf, len, formOrd);
160113
}
161114

162115
/**
163-
* POS tag id for the compound token produced by this entry.
164-
*
165-
* @param entryId entry id returned by {@link #acceptEntry(int)}
116+
* Freezes the vocabulary and packs the automaton into primitive arrays.
117+
* Must be called before any runtime method. Idempotent.
166118
*/
167-
public short pos(final int entryId)
119+
public void freeze()
168120
{
169-
checkEntry(entryId);
170-
return pos[entryId];
121+
vocab.trimToSize();
122+
auto.freeze(false);
123+
idsBuf = null;
171124
}
172125

173126
/**
174-
* Optional lemma id for the compound token produced by this entry.
175-
*
176-
* @param entryId entry id returned by {@link #acceptEntry(int)}
177-
* @return lemma id, or -1 if none is defined
127+
* Upper bound on MWE length in tokens.
128+
* Use to size the token filter's lookahead buffer ({@code maxLen() + 1}).
178129
*/
179-
public int lemmaId(final int entryId)
130+
public int maxLen()
180131
{
181-
checkEntry(entryId);
182-
return lemmaId[entryId];
132+
return auto.maxLen();
183133
}
184134

185-
/**
186-
* Copies the output term for {@code entryId} into {@code dst[0..len)} and returns {@code len}.
187-
*
188-
* <p>This is the hot-path method used by {@link MweFilter} to emit the canonical MWE term text
189-
* without allocating.</p>
190-
*
191-
* <p>Caller responsibility: ensure {@code dst.length >= maxOutputLen()}.</p>
192-
*
193-
* @param entryId entry id returned by {@link #acceptEntry(int)}
194-
* @param dst destination buffer (start at index 0)
195-
* @return length in UTF-16 code units
196-
* @throws IllegalArgumentException if {@code entryId} is invalid or {@code dst} too small
197-
* @throws NullPointerException if {@code dst} is null
198-
*/
199-
public int copyOutput(final int entryId, final char[] dst)
135+
/** Root state; pass as the initial state to the first {@link #step} call per position. */
136+
public int root()
200137
{
201-
checkEntry(entryId);
202-
if (dst == null) throw new NullPointerException("dst");
203-
204-
final int ord = outputOrd[entryId];
205-
206-
// Preferred if you add the safe copy-out API to CharsDic:
207-
// return outputDic.get(ord, dst);
208-
209-
// Fallback using current CharsDic API (slab + offsets) while keeping it encapsulated here:
210-
final int len = outputDic.termLength(ord);
211-
if (dst.length < len) {
212-
throw new IllegalArgumentException("dst too small: dst.length=" + dst.length + " need=" + len);
213-
}
214-
final int off = outputDic.termOffset(ord);
215-
System.arraycopy(outputDic.slab(), off, dst, 0, len);
216-
return len;
138+
return auto.root();
217139
}
218140

219141
/**
220-
* Maximum output term length among all entries, in UTF-16 code units.
142+
* Advances the automaton by one token.
221143
*
222-
* <p>Use this value to allocate a reusable scratch buffer for {@link #copyOutput(int, char[])}.</p>
223-
*/
224-
public int maxOutputLen()
225-
{
226-
return maxOutputLen;
227-
}
228-
229-
/**
230-
* Debug helper: returns the output term as a String (allocates).
144+
* <p>Tokens absent from the vocabulary return -1 immediately without touching
145+
* the automaton — fast-fail for the common case. Throws {@link IllegalStateException} if {@link #freeze()} has not been called.</p>
231146
*
232-
* @param entryId entry id returned by {@link #acceptEntry(int)}
147+
* @param state current automaton state
148+
* @param buf token character buffer (e.g. {@link CharTermAttribute#buffer()})
149+
* @param len number of valid chars in {@code buf}
150+
* @return next state, or -1 if no transition exists
233151
*/
234-
public String outputAsString(final int entryId)
152+
public int step(final int state, final char[] buf, final int len)
235153
{
236-
checkEntry(entryId);
237-
final int ord = outputOrd[entryId];
238-
final int off = outputDic.termOffset(ord);
239-
final int len = outputDic.termLength(ord);
240-
return new String(outputDic.slab(), off, len);
154+
if (idsBuf != null) throw new IllegalStateException("not frozen");
155+
final int tokOrd = vocab.find(buf, 0, len);
156+
if (tokOrd < 0) return -1;
157+
return auto.step(state, tokOrd);
241158
}
242159

243160
/**
244-
* Exposes the internal automaton (advanced use).
161+
* Shared vocabulary; use ordinals from {@link #accept(int)} to retrieve canonical forms.
245162
*/
246-
public IntAutomaton automaton()
163+
public CharsDic vocab()
247164
{
248-
return automaton;
249-
}
250-
251-
/**
252-
* Exposes the output dictionary (advanced use; immutable by convention).
253-
*/
254-
public CharsDic outputDic()
255-
{
256-
return outputDic;
257-
}
258-
259-
private void checkEntry(final int entryId)
260-
{
261-
if (entryId < 0 || entryId >= outputOrd.length) {
262-
throw new IllegalArgumentException("bad entryId " + entryId);
263-
}
165+
return vocab;
264166
}
265167

266-
@SuppressWarnings("unused")
267-
private void validateAcceptIds(final int entryCount)
168+
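/** Recovers a {@link CharsDic} ordinal from the raw return value of {@code add()}: a non-negative value is the ordinal itself, a negative value encodes it as {@code -ord - 1} (presumably meaning the term was already present; confirm against {@link CharsDic}). */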
169+
private static int ord(final int raw)
268170
{
269-
// IntAutomaton stores accept per state; we can only validate by scanning all states if you expose state count.
270-
// If you later add IntAutomaton.stateCount(), validate accept ids here.
171+
return raw >= 0 ? raw : -raw - 1;
271172
}
272173
}

0 commit comments

Comments
 (0)