Skip to content

Commit 6c44e53

Browse files
committed
Simple query parser
1 parent 140c616 commit 6c44e53

4 files changed

Lines changed: 157 additions & 129 deletions

File tree

common/src/java/com/github/oeuvres/alix/lucene/spans/SpanQueryParser.java

Lines changed: 72 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -6,58 +6,22 @@
66
import org.apache.lucene.queries.spans.SpanQuery;
77
import org.apache.lucene.queries.spans .SpanTermQuery;
88

9+
import com.github.oeuvres.alix.util.WordTokenizer;
10+
911
import java.util.ArrayList;
1012
import java.util.List;
1113

1214
/**
13-
* Parses a pivot specification into a {@link SpanQuery}.
14-
*
15-
* <h2>Input syntax</h2>
16-
* <p>Groups are separated by commas or newlines (both are equivalent).
17-
* Terms within a group are whitespace-separated and combined with OR.
18-
* Groups are combined with AND (all must occur within the span).</p>
19-
*
20-
* <pre>
21-
* libre liberté, responsable responsabilité
22-
* </pre>
23-
* <p>is equivalent to:</p>
24-
* <pre>
25-
* libre liberté
26-
* responsable responsabilité
27-
* </pre>
28-
* <p>and produces:</p>
29-
* <pre>
30-
* SpanNearQuery(
31-
* SpanOrQuery(libre, liberté),
32-
* SpanOrQuery(responsable, responsabilité),
33-
* slop, inOrder=false
34-
* )
35-
* </pre>
15+
* Parses a user query into a {@link SpanQuery}.
3616
*
37-
* <h2>Degenerate cases</h2>
38-
* <table border="1">
39-
* <tr><th>Input</th><th>Result</th></tr>
40-
* <tr><td>{@code libre}</td><td>{@link SpanTermQuery}</td></tr>
41-
* <tr><td>{@code libre liberté}</td><td>{@link SpanOrQuery}</td></tr>
42-
* <tr><td>{@code libre, responsable}</td><td>{@link SpanNearQuery}</td></tr>
43-
* </table>
44-
*
45-
* <h2>Slop semantics</h2>
46-
* <p>Lucene slop counts the minimum number of position moves to bring all
47-
* matched terms adjacent. For a two-group unordered match, pass
48-
* {@code slop = maxGap - 1} to get a maximum token distance of
49-
* {@code maxGap} between the two outermost matched tokens. For three or
50-
* more groups the total span width is {@code slop + numberOfGroups - 1};
51-
* verify this matches intent before use.</p>
52-
*
53-
* <h2>Terms</h2>
54-
* <p>No analysis is applied. Terms are used verbatim; the caller is
55-
* responsible for passing tokens in the form stored in the index.</p>
5617
*/
5718
public class SpanQueryParser {
5819

5920
private final String field;
6021
private final int slop;
22+
private final WordTokenizer tokenizer;
23+
private final String OR_OPEN = "OrOpen";
24+
private final String OR_CLOSE = "OrClose";
6125

6226
/**
6327
* Creates a parser for the given field and slop.
@@ -67,80 +31,89 @@ public class SpanQueryParser {
6731
* token distance of {@code maxGap} between outermost pivots
6832
* @throws IllegalArgumentException if {@code field} is blank or {@code slop} is negative
6933
*/
70-
public SpanQueryParser(final String field, final int slop) {
34+
public SpanQueryParser(final String field, final int slop, final WordTokenizer tokenizer) {
7135
if (field == null || field.isBlank())
7236
throw new IllegalArgumentException("field must not be blank");
7337
if (slop < 0)
7438
throw new IllegalArgumentException("slop must be >= 0, got " + slop);
7539
this.field = field;
7640
this.slop = slop;
41+
this.tokenizer = tokenizer;
7742
}
7843

7944
/**
80-
* Parses the pivot specification and returns the most specific
81-
* {@link SpanQuery} that represents it.
82-
*
83-
* <p>The return type depends on the number of groups found:</p>
84-
* <ul>
85-
* <li>0 non-empty groups → {@link IllegalArgumentException}</li>
86-
* <li>1 group, 1 term → {@link SpanTermQuery}</li>
87-
* <li>1 group, N terms → {@link SpanOrQuery}</li>
88-
* <li>2+ groups → {@link SpanNearQuery} whose clauses are
89-
* {@link SpanTermQuery} (single-term group) or
90-
* {@link SpanOrQuery} (multi-term group)</li>
91-
* </ul>
45+
* Parses the user query.
9246
*
93-
* @param spec pivot specification; groups separated by {@code ,} or
94-
* newline; terms within a group separated by whitespace
95-
* @return assembled query
96-
* @throws IllegalArgumentException if {@code spec} is blank or yields no terms
47+
* @param queryText user query text
48+
* @return assembled span query, or {@code null} if the query is blank or yields no term
9749
*/
98-
public SpanQuery parse(final String spec) {
99-
if (spec == null || spec.isBlank())
100-
throw new IllegalArgumentException("spec must not be blank");
101-
102-
final List<SpanQuery> groups = new ArrayList<>();
103-
for (final String groupStr : spec.split("[,\\n\\r]+")) {
104-
final String trimmed = groupStr.strip();
105-
if (trimmed.isEmpty()) continue;
106-
final SpanQuery group = buildGroup(trimmed);
107-
if (group != null) groups.add(group);
50+
public SpanQuery parse(final String queryText) {
51+
if (queryText == null || queryText.isBlank()) {
52+
return null; // let caller alert user
10853
}
10954

110-
switch (groups.size()) {
111-
case 0:
112-
throw new IllegalArgumentException("spec contains no usable terms: \"" + spec + "\"");
113-
case 1:
114-
return groups.get(0);
115-
default:
116-
return new SpanNearQuery(
117-
groups.toArray(new SpanQuery[0]),
118-
slop,
119-
false
120-
);
55+
final String q = queryText
56+
.replace("(", " " + OR_OPEN + " ")
57+
.replace(")", " " + OR_CLOSE + " ");
58+
59+
final List<String> words = tokenizer.tokenize(q);
60+
final List<SpanQuery> clauses = new ArrayList<>();
61+
List<SpanQuery> orClauses = null;
62+
63+
for (final String word : words) {
64+
if (OR_OPEN.equals(word)) {
65+
if (orClauses == null) {
66+
orClauses = new ArrayList<>();
67+
}
68+
continue; // nested opening parenthesis: skip silently
69+
}
70+
71+
if (OR_CLOSE.equals(word)) {
72+
if (orClauses == null) {
73+
continue; // closing parenthesis without opening: skip silently
74+
}
75+
if (orClauses.size() == 1) {
76+
clauses.add(orClauses.get(0));
77+
}
78+
else if (!orClauses.isEmpty()) {
79+
clauses.add(new SpanOrQuery(orClauses.toArray(new SpanQuery[0])));
80+
}
81+
orClauses = null;
82+
continue;
83+
}
84+
85+
// TODO eliminate stop words
86+
// TODO hunspell lemmatize
87+
// TODO concat know multi-word expression for the field
88+
// TODO first suggest hunspell
89+
// TODO eliminate unknown word from field
90+
final SpanQuery term = new SpanTermQuery(new Term(field, word));
91+
92+
if (orClauses != null) {
93+
orClauses.add(term);
94+
}
95+
else {
96+
clauses.add(term);
97+
}
12198
}
122-
}
12399

124-
/**
125-
* Builds a {@link SpanTermQuery} or {@link SpanOrQuery} from the
126-
* whitespace-separated terms of one group.
127-
*
128-
* @param groupStr non-empty, already stripped group string
129-
* @return query, or {@code null} if the string yields no tokens
130-
*/
131-
private SpanQuery buildGroup(final String groupStr) {
132-
final String[] tokens = groupStr.split("\\s+");
133-
final List<SpanTermQuery> alternatives = new ArrayList<>(tokens.length);
134-
for (String token : tokens) {
135-
token = token.replace('_', ' ');
136-
if (!token.isEmpty()) {
137-
alternatives.add(new SpanTermQuery(new Term(field, token)));
100+
// Opening parenthesis without closing: go to end of query string.
101+
if (orClauses != null) {
102+
if (orClauses.size() == 1) {
103+
clauses.add(orClauses.get(0));
104+
}
105+
else if (!orClauses.isEmpty()) {
106+
clauses.add(new SpanOrQuery(orClauses.toArray(new SpanQuery[0])));
138107
}
139108
}
140-
switch (alternatives.size()) {
141-
case 0: return null;
142-
case 1: return alternatives.get(0);
143-
default: return new SpanOrQuery(alternatives.toArray(new SpanQuery[0]));
109+
110+
if (clauses.isEmpty()) {
111+
return null;
144112
}
113+
if (clauses.size() == 1) {
114+
return clauses.get(0);
115+
}
116+
return new SpanNearQuery(clauses.toArray(new SpanQuery[0]), slop, true);
145117
}
118+
146119
}

util/src/java/com/github/oeuvres/alix/util/CharsDic.java

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@
1414
*/
1515
package com.github.oeuvres.alix.util;
1616

17+
import java.io.IOException;
18+
import java.io.UncheckedIOException;
1719
import java.util.Arrays;
20+
import java.util.Objects;
1821

1922
/**
2023
* Dependency-free hash dictionary of UTF-16 character sequences with stable
@@ -221,6 +224,51 @@ public int add(final CharSequence key, final int off, final int len)
221224
return intern(null, off, len, key);
222225
}
223226

227+
/**
228+
* Appends the sequence stored at {@code ord} to an {@link Appendable}.
229+
*
230+
* <p>If {@code ord} is negative, typically {@link #NOT_IN_DIC}, the same
231+
* negative value is returned and {@code dst} is left untouched. This mirrors
232+
* {@link #copy(int, char[], int)} and allows callers to propagate lookup
233+
* misses without a separate branch.</p>
234+
*
235+
* <p>The appended characters are the stored UTF-16 code units. In
236+
* ignore-case dictionaries, this means the lowercased form stored in the slab
237+
* is appended, not the original input spelling.</p>
238+
*
239+
* @param ord ord to read; negative values pass through
240+
* @param dst destination appendable, required only when {@code ord >= 0}
241+
* @return the number of chars appended, or {@code ord} unchanged if negative
242+
* @throws NullPointerException if {@code dst} is {@code null} and
243+
* {@code ord >= 0}
244+
* @throws IllegalArgumentException if {@code ord >= size()}
245+
* @throws UncheckedIOException if {@code dst} throws IOException while appending
246+
*/
247+
public int append(final int ord, final Appendable dst)
248+
{
249+
if (ord < 0) {
250+
return ord;
251+
}
252+
if (ord >= sizeOrds) {
253+
throw new IllegalArgumentException("bad ord " + ord + " (size=" + sizeOrds + ")");
254+
}
255+
Objects.requireNonNull(dst, "dst");
256+
257+
final long m = meta[ord];
258+
int off = metaOff(m);
259+
final int len = metaLen(m);
260+
final int lim = off + len;
261+
for (; off < lim; off++) {
262+
try {
263+
dst.append(slab[off]);
264+
}
265+
catch (IOException e) {
266+
throw new UncheckedIOException(e);
267+
}
268+
}
269+
return len;
270+
}
271+
224272
/**
225273
* Returns the stored sequence at {@code ord} as a newly allocated string.
226274
*
@@ -230,6 +278,7 @@ public int add(final CharSequence key, final int off, final int len)
230278
*/
231279
public String asString(final int ord)
232280
{
281+
if (ord < 0) return null;
233282
checkOrd(ord);
234283
final long m = meta[ord];
235284
return new String(slab, metaOff(m), metaLen(m));
@@ -319,7 +368,7 @@ public int copy(final int ord, final char[] dst, final int dstOff)
319368
System.arraycopy(slab, metaOff(m), dst, dstOff, len);
320369
return len;
321370
}
322-
371+
323372
/**
324373
* Returns the length of the sequence stored at {@code ord}.
325374
*

util/src/java/com/github/oeuvres/alix/util/CharsMap.java

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
*/
1212
package com.github.oeuvres.alix.util;
1313

14+
import java.io.IOException;
15+
import java.io.UncheckedIOException;
1416
import java.util.Arrays;
1517
import java.util.Objects;
1618

@@ -74,6 +76,27 @@ public CharsMap(final int expectedSize)
7476
this.values = new int[Math.max(8, expectedSize)];
7577
Arrays.fill(this.values, HAS_NO_VALUE);
7678
}
79+
80+
/**
81+
* Appends the sequence stored at {@code ord} to an {@link Appendable}.
82+
*
83+
* <p>If {@code ord} is negative, typically {@link #NOT_IN_DIC}, the same
84+
* negative value is returned and {@code dst} is left untouched. This allows
85+
* callers to propagate lookup
86+
* misses without a separate branch.</p>
87+
*
88+
* @param ord ord to read; negative values pass through
89+
* @param dst destination appendable, required only when {@code ord >= 0}
90+
* @return the number of chars appended, or {@code ord} unchanged if negative
91+
* @throws NullPointerException if {@code dst} is {@code null} and
92+
* {@code ord >= 0}
93+
* @throws IllegalArgumentException if {@code ord >= size()}
94+
* @throws UncheckedIOException if {@code dst} throws IOException while appending
95+
*/
96+
public int append(final int ord, final Appendable dst)
97+
{
98+
return dic.append(ord, dst);
99+
}
77100

78101
/**
79102
* Returns the stored sequence at {@code ord} as a newly allocated string.

0 commit comments

Comments
 (0)