66import org .apache .lucene .queries .spans .SpanQuery ;
77import org .apache .lucene .queries .spans .SpanTermQuery ;
88
9+ import com .github .oeuvres .alix .util .WordTokenizer ;
10+
911import java .util .ArrayList ;
1012import java .util .List ;
1113
1214/**
13- * Parses a pivot specification into a {@link SpanQuery}.
14- *
15- * <h2>Input syntax</h2>
16- * <p>Groups are separated by commas or newlines (both are equivalent).
17- * Terms within a group are whitespace-separated and combined with OR.
18- * Groups are combined with AND (all must occur within the span).</p>
19- *
20- * <pre>
21- * libre liberté, responsable responsabilité
22- * </pre>
23- * <p>is equivalent to:</p>
24- * <pre>
25- * libre liberté
26- * responsable responsabilité
27- * </pre>
28- * <p>and produces:</p>
29- * <pre>
30- * SpanNearQuery(
31- * SpanOrQuery(libre, liberté),
32- * SpanOrQuery(responsable, responsabilité),
33- * slop, inOrder=false
34- * )
35- * </pre>
15+ * Parses a user query into a {@link SpanQuery}.
3616 *
37- * <h2>Degenerate cases</h2>
38- * <table border="1">
39- * <tr><th>Input</th><th>Result</th></tr>
40- * <tr><td>{@code libre}</td><td>{@link SpanTermQuery}</td></tr>
41- * <tr><td>{@code libre liberté}</td><td>{@link SpanOrQuery}</td></tr>
42- * <tr><td>{@code libre, responsable}</td><td>{@link SpanNearQuery}</td></tr>
43- * </table>
44- *
45- * <h2>Slop semantics</h2>
46- * <p>Lucene slop counts the minimum number of position moves to bring all
47- * matched terms adjacent. For a two-group unordered match, pass
48- * {@code slop = maxGap - 1} to get a maximum token distance of
49- * {@code maxGap} between the two outermost matched tokens. For three or
50- * more groups the total span width is {@code slop + numberOfGroups - 1};
51- * verify this matches intent before use.</p>
52- *
53- * <h2>Terms</h2>
54- * <p>No analysis is applied. Terms are used verbatim; the caller is
55- * responsible for passing tokens in the form stored in the index.</p>
5617 */
5718public class SpanQueryParser {
5819
5920 private final String field ;
6021 private final int slop ;
22+ private final WordTokenizer tokenizer ;
23+ private final String OR_OPEN = "OrOpen" ;
24+ private final String OR_CLOSE = "OrClose" ;
6125
6226 /**
6327 * Creates a parser for the given field and slop.
@@ -67,80 +31,89 @@ public class SpanQueryParser {
6731 * token distance of {@code maxGap} between outermost pivots
6832 * @throws IllegalArgumentException if {@code field} is blank or {@code slop} is negative
6933 */
70- public SpanQueryParser (final String field , final int slop ) {
34+ public SpanQueryParser (final String field , final int slop , final WordTokenizer tokenizer ) {
7135 if (field == null || field .isBlank ())
7236 throw new IllegalArgumentException ("field must not be blank" );
7337 if (slop < 0 )
7438 throw new IllegalArgumentException ("slop must be >= 0, got " + slop );
7539 this .field = field ;
7640 this .slop = slop ;
41+ this .tokenizer = tokenizer ;
7742 }
7843
7944 /**
80- * Parses the pivot specification and returns the most specific
81- * {@link SpanQuery} that represents it.
82- *
83- * <p>The return type depends on the number of groups found:</p>
84- * <ul>
85- * <li>0 non-empty groups → {@link IllegalArgumentException}</li>
86- * <li>1 group, 1 term → {@link SpanTermQuery}</li>
87- * <li>1 group, N terms → {@link SpanOrQuery}</li>
88- * <li>2+ groups → {@link SpanNearQuery} whose clauses are
89- * {@link SpanTermQuery} (single-term group) or
90- * {@link SpanOrQuery} (multi-term group)</li>
91- * </ul>
45+ * Parses the user query.
9246 *
93- * @param spec pivot specification; groups separated by {@code ,} or
94- * newline; terms within a group separated by whitespace
95- * @return assembled query
96- * @throws IllegalArgumentException if {@code spec} is blank or yields no terms
47+ * @param queryText user query text
48+ * @return assembled span query, or {@code null} if the query is blank or yields no terms
9749 */
98- public SpanQuery parse (final String spec ) {
99- if (spec == null || spec .isBlank ())
100- throw new IllegalArgumentException ("spec must not be blank" );
101-
102- final List <SpanQuery > groups = new ArrayList <>();
103- for (final String groupStr : spec .split ("[,\\ n\\ r]+" )) {
104- final String trimmed = groupStr .strip ();
105- if (trimmed .isEmpty ()) continue ;
106- final SpanQuery group = buildGroup (trimmed );
107- if (group != null ) groups .add (group );
50+ public SpanQuery parse (final String queryText ) {
51+ if (queryText == null || queryText .isBlank ()) {
52+ return null ; // let caller alert user
10853 }
10954
110- switch (groups .size ()) {
111- case 0 :
112- throw new IllegalArgumentException ("spec contains no usable terms: \" " + spec + "\" " );
113- case 1 :
114- return groups .get (0 );
115- default :
116- return new SpanNearQuery (
117- groups .toArray (new SpanQuery [0 ]),
118- slop ,
119- false
120- );
55+ final String q = queryText
56+ .replace ("(" , " " + OR_OPEN + " " )
57+ .replace (")" , " " + OR_CLOSE + " " );
58+
59+ final List <String > words = tokenizer .tokenize (q );
60+ final List <SpanQuery > clauses = new ArrayList <>();
61+ List <SpanQuery > orClauses = null ;
62+
63+ for (final String word : words ) {
64+ if (OR_OPEN .equals (word )) {
65+ if (orClauses == null ) {
66+ orClauses = new ArrayList <>();
67+ }
68+ continue ; // nested opening parenthesis: skip silently
69+ }
70+
71+ if (OR_CLOSE .equals (word )) {
72+ if (orClauses == null ) {
73+ continue ; // closing parenthesis without opening: skip silently
74+ }
75+ if (orClauses .size () == 1 ) {
76+ clauses .add (orClauses .get (0 ));
77+ }
78+ else if (!orClauses .isEmpty ()) {
79+ clauses .add (new SpanOrQuery (orClauses .toArray (new SpanQuery [0 ])));
80+ }
81+ orClauses = null ;
82+ continue ;
83+ }
84+
85+ // TODO eliminate stop words
86+ // TODO hunspell lemmatize
87+ // TODO concat known multi-word expressions for the field
88+ // TODO first suggest hunspell
89+ // TODO eliminate unknown word from field
90+ final SpanQuery term = new SpanTermQuery (new Term (field , word ));
91+
92+ if (orClauses != null ) {
93+ orClauses .add (term );
94+ }
95+ else {
96+ clauses .add (term );
97+ }
12198 }
122- }
12399
124- /**
125- * Builds a {@link SpanTermQuery} or {@link SpanOrQuery} from the
126- * whitespace-separated terms of one group.
127- *
128- * @param groupStr non-empty, already stripped group string
129- * @return query, or {@code null} if the string yields no tokens
130- */
131- private SpanQuery buildGroup (final String groupStr ) {
132- final String [] tokens = groupStr .split ("\\ s+" );
133- final List <SpanTermQuery > alternatives = new ArrayList <>(tokens .length );
134- for (String token : tokens ) {
135- token = token .replace ('_' , ' ' );
136- if (!token .isEmpty ()) {
137- alternatives .add (new SpanTermQuery (new Term (field , token )));
100+ // Opening parenthesis without closing: the group runs to the end of the query string.
101+ if (orClauses != null ) {
102+ if (orClauses .size () == 1 ) {
103+ clauses .add (orClauses .get (0 ));
104+ }
105+ else if (!orClauses .isEmpty ()) {
106+ clauses .add (new SpanOrQuery (orClauses .toArray (new SpanQuery [0 ])));
138107 }
139108 }
140- switch (alternatives .size ()) {
141- case 0 : return null ;
142- case 1 : return alternatives .get (0 );
143- default : return new SpanOrQuery (alternatives .toArray (new SpanQuery [0 ]));
109+
110+ if (clauses .isEmpty ()) {
111+ return null ;
144112 }
113+ if (clauses .size () == 1 ) {
114+ return clauses .get (0 );
115+ }
116+ return new SpanNearQuery (clauses .toArray (new SpanQuery [0 ]), slop , true );
145117 }
118+
146119}
0 commit comments