Simple query parser

glorieux-f · glorieux-f · commit 6c44e5316cd3 · 2026-05-10T21:55:01.000+02:00
diff --git a/common/src/java/com/github/oeuvres/alix/lucene/spans/SpanQueryParser.java b/common/src/java/com/github/oeuvres/alix/lucene/spans/SpanQueryParser.java
@@ -6,58 +6,22 @@
 import org.apache.lucene.queries.spans.SpanQuery;
 import org.apache.lucene.queries.spans .SpanTermQuery;
 
+import com.github.oeuvres.alix.util.WordTokenizer;
+
 import java.util.ArrayList;
 import java.util.List;
 
 /**
- * Parses a pivot specification into a {@link SpanQuery}.
- *
- * <h2>Input syntax</h2>
- * <p>Groups are separated by commas or newlines (both are equivalent).
- * Terms within a group are whitespace-separated and combined with OR.
- * Groups are combined with AND (all must occur within the span).</p>
- *
- * <pre>
- * libre liberté, responsable responsabilité
- * </pre>
- * <p>is equivalent to:</p>
- * <pre>
- * libre liberté
- * responsable responsabilité
- * </pre>
- * <p>and produces:</p>
- * <pre>
- * SpanNearQuery(
- *   SpanOrQuery(libre, liberté),
- *   SpanOrQuery(responsable, responsabilité),
- *   slop, inOrder=false
- * )
- * </pre>
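+ * Parses a user query into a {@link SpanQuery}: top-level terms must occur in query order within the configured slop, and terms grouped in parentheses are alternatives (OR).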
  *
- * <h2>Degenerate cases</h2>
- * <table border="1">
- *   <tr><th>Input</th><th>Result</th></tr>
- *   <tr><td>{@code libre}</td><td>{@link SpanTermQuery}</td></tr>
- *   <tr><td>{@code libre liberté}</td><td>{@link SpanOrQuery}</td></tr>
- *   <tr><td>{@code libre, responsable}</td><td>{@link SpanNearQuery}</td></tr>
- * </table>
- *
- * <h2>Slop semantics</h2>
- * <p>Lucene slop counts the minimum number of position moves to bring all
- * matched terms adjacent. For a two-group unordered match, pass
- * {@code slop = maxGap - 1} to get a maximum token distance of
- * {@code maxGap} between the two outermost matched tokens. For three or
- * more groups the total span width is {@code slop + numberOfGroups - 1};
- * verify this matches intent before use.</p>
- *
- * <h2>Terms</h2>
- * <p>No analysis is applied. Terms are used verbatim; the caller is
- * responsible for passing tokens in the form stored in the index.</p>
  */
 public class SpanQueryParser {
 
     private final String field;
     private final int slop;
+    private final WordTokenizer tokenizer;
+    private final String OR_OPEN = "OrOpen";
+    private final String OR_CLOSE = "OrClose";
 
     /**
      * Creates a parser for the given field and slop.
@@ -67,80 +31,89 @@ public class SpanQueryParser {
      *              token distance of {@code maxGap} between outermost pivots
      * @throws IllegalArgumentException if {@code field} is blank or {@code slop} is negative
      */
-    public SpanQueryParser(final String field, final int slop) {
+    public SpanQueryParser(final String field, final int slop, final WordTokenizer tokenizer) {
         if (field == null || field.isBlank())
             throw new IllegalArgumentException("field must not be blank");
         if (slop < 0)
             throw new IllegalArgumentException("slop must be >= 0, got " + slop);
         this.field = field;
         this.slop = slop;
+        this.tokenizer = tokenizer;
     }
 
     /**
-     * Parses the pivot specification and returns the most specific
-     * {@link SpanQuery} that represents it.
-     *
-     * <p>The return type depends on the number of groups found:</p>
-     * <ul>
-     *   <li>0 non-empty groups → {@link IllegalArgumentException}</li>
-     *   <li>1 group, 1 term → {@link SpanTermQuery}</li>
-     *   <li>1 group, N terms → {@link SpanOrQuery}</li>
-     *   <li>2+ groups → {@link SpanNearQuery} whose clauses are
-     *       {@link SpanTermQuery} (single-term group) or
-     *       {@link SpanOrQuery} (multi-term group)</li>
-     * </ul>
+     * Parses the user query.
      *
-     * @param spec pivot specification; groups separated by {@code ,} or
-     *             newline; terms within a group separated by whitespace
-     * @return assembled query
-     * @throws IllegalArgumentException if {@code spec} is blank or yields no terms
+     * @param queryText user query text
+     * @return assembled span query, or {@code null} if the query is blank or yields no terms
      */
-    public SpanQuery parse(final String spec) {
-        if (spec == null || spec.isBlank())
-            throw new IllegalArgumentException("spec must not be blank");
-
-        final List<SpanQuery> groups = new ArrayList<>();
-        for (final String groupStr : spec.split("[,\\n\\r]+")) {
-            final String trimmed = groupStr.strip();
-            if (trimmed.isEmpty()) continue;
-            final SpanQuery group = buildGroup(trimmed);
-            if (group != null) groups.add(group);
+    public SpanQuery parse(final String queryText) {
+        if (queryText == null || queryText.isBlank()) {
+            return null; // let caller alert user
         }
 
-        switch (groups.size()) {
-            case 0:
-                throw new IllegalArgumentException("spec contains no usable terms: \"" + spec + "\"");
-            case 1:
-                return groups.get(0);
-            default:
-                return new SpanNearQuery(
-                    groups.toArray(new SpanQuery[0]),
-                    slop,
-                    false
-                );
+        final String q = queryText
+            .replace("(", " " + OR_OPEN + " ")
+            .replace(")", " " + OR_CLOSE + " ");
+
+        final List<String> words = tokenizer.tokenize(q);
+        final List<SpanQuery> clauses = new ArrayList<>();
+        List<SpanQuery> orClauses = null;
+
+        for (final String word : words) {
+            if (OR_OPEN.equals(word)) {
+                if (orClauses == null) {
+                    orClauses = new ArrayList<>();
+                }
+                continue; // nested opening parenthesis: skip silently
+            }
+
+            if (OR_CLOSE.equals(word)) {
+                if (orClauses == null) {
+                    continue; // closing parenthesis without opening: skip silently
+                }
+                if (orClauses.size() == 1) {
+                    clauses.add(orClauses.get(0));
+                }
+                else if (!orClauses.isEmpty()) {
+                    clauses.add(new SpanOrQuery(orClauses.toArray(new SpanQuery[0])));
+                }
+                orClauses = null;
+                continue;
+            }
+
+            // TODO eliminate stop words
+            // TODO hunspell lemmatize
+            // TODO concat known multi-word expressions for the field
+            // TODO first suggest hunspell
+            // TODO eliminate unknown word from field
+            final SpanQuery term = new SpanTermQuery(new Term(field, word));
+
+            if (orClauses != null) {
+                orClauses.add(term);
+            }
+            else {
+                clauses.add(term);
+            }
         }
-    }
 
-    /**
-     * Builds a {@link SpanTermQuery} or {@link SpanOrQuery} from the
-     * whitespace-separated terms of one group.
-     *
-     * @param groupStr non-empty, already stripped group string
-     * @return query, or {@code null} if the string yields no tokens
-     */
-    private SpanQuery buildGroup(final String groupStr) {
-        final String[] tokens = groupStr.split("\\s+");
-        final List<SpanTermQuery> alternatives = new ArrayList<>(tokens.length);
-        for (String token : tokens) {
-            token = token.replace('_', ' ');
-            if (!token.isEmpty()) {
-                alternatives.add(new SpanTermQuery(new Term(field, token)));
+        // Opening parenthesis without closing: go to end of query string.
+        if (orClauses != null) {
+            if (orClauses.size() == 1) {
+                clauses.add(orClauses.get(0));
+            }
+            else if (!orClauses.isEmpty()) {
+                clauses.add(new SpanOrQuery(orClauses.toArray(new SpanQuery[0])));
             }
         }
-        switch (alternatives.size()) {
-            case 0:  return null;
-            case 1:  return alternatives.get(0);
-            default: return new SpanOrQuery(alternatives.toArray(new SpanQuery[0]));
+
+        if (clauses.isEmpty()) {
+            return null;
         }
+        if (clauses.size() == 1) {
+            return clauses.get(0);
+        }
+        return new SpanNearQuery(clauses.toArray(new SpanQuery[0]), slop, true);
     }
+
 }
diff --git a/util/src/java/com/github/oeuvres/alix/util/CharsDic.java b/util/src/java/com/github/oeuvres/alix/util/CharsDic.java
@@ -14,7 +14,10 @@
  */
 package com.github.oeuvres.alix.util;
 
+import java.io.IOException;
+import java.io.UncheckedIOException;
 import java.util.Arrays;
+import java.util.Objects;
 
 /**
  * Dependency-free hash dictionary of UTF-16 character sequences with stable
@@ -221,6 +224,51 @@ public int add(final CharSequence key, final int off, final int len)
         return intern(null, off, len, key);
     }
 
+    /**
+     * Appends the sequence stored at {@code ord} to an {@link Appendable}.
+     *
+     * <p>If {@code ord} is negative, typically {@link #NOT_IN_DIC}, the same
+     * negative value is returned and {@code dst} is left untouched. This mirrors
+     * {@link #copy(int, char[], int)} and allows callers to propagate lookup
+     * misses without a separate branch.</p>
+     *
+     * <p>The appended characters are the stored UTF-16 code units. In
+     * ignore-case dictionaries, this means the lowercased form stored in the slab
+     * is appended, not the original input spelling.</p>
+     *
+     * @param ord ord to read; negative values pass through
+     * @param dst destination appendable, required only when {@code ord >= 0}
+     * @return the number of chars appended, or {@code ord} unchanged if negative
+     * @throws NullPointerException if {@code dst} is {@code null} and
+     *         {@code ord >= 0}
+     * @throws IllegalArgumentException if {@code ord >= size()}
+     * @throws UncheckedIOException if {@code dst} throws IOException while appending
+     */
+    public int append(final int ord, final Appendable dst)
+    {
+        if (ord < 0) {
+            return ord;
+        }
+        if (ord >= sizeOrds) {
+            throw new IllegalArgumentException("bad ord " + ord + " (size=" + sizeOrds + ")");
+        }
+        Objects.requireNonNull(dst, "dst");
+
+        final long m = meta[ord];
+        int off = metaOff(m);
+        final int len = metaLen(m);
+        final int lim = off + len;
+        for (; off < lim; off++) {
+            try {
+                dst.append(slab[off]);
+            }
+            catch (IOException e) {
+                throw new UncheckedIOException(e);
+            }
+        }
+        return len;
+    }
+
     /**
      * Returns the stored sequence at {@code ord} as a newly allocated string.
      *
@@ -230,6 +278,7 @@ public int add(final CharSequence key, final int off, final int len)
      */
     public String asString(final int ord)
     {
+        if (ord < 0) return null;
         checkOrd(ord);
         final long m = meta[ord];
         return new String(slab, metaOff(m), metaLen(m));
@@ -319,7 +368,7 @@ public int copy(final int ord, final char[] dst, final int dstOff)
         System.arraycopy(slab, metaOff(m), dst, dstOff, len);
         return len;
     }
-
+    
     /**
      * Returns the length of the sequence stored at {@code ord}.
      *
diff --git a/util/src/java/com/github/oeuvres/alix/util/CharsMap.java b/util/src/java/com/github/oeuvres/alix/util/CharsMap.java
@@ -11,6 +11,8 @@
  */
 package com.github.oeuvres.alix.util;
 
+import java.io.IOException;
+import java.io.UncheckedIOException;
 import java.util.Arrays;
 import java.util.Objects;
 
@@ -74,6 +76,27 @@ public CharsMap(final int expectedSize)
         this.values = new int[Math.max(8, expectedSize)];
         Arrays.fill(this.values, HAS_NO_VALUE);
     }
+    
+    /**
+     * Appends the sequence stored at {@code ord} to an {@link Appendable}.
+     *
+     * <p>If {@code ord} is negative, typically {@link #NOT_IN_DIC}, the same
+     * negative value is returned and {@code dst} is left untouched. This allows
+     * callers to propagate lookup misses without a separate branch. The call is
+     * delegated to the underlying dictionary.</p>
+     *
+     * @param ord ord to read; negative values pass through
+     * @param dst destination appendable, required only when {@code ord >= 0}
+     * @return the number of chars appended, or {@code ord} unchanged if negative
+     * @throws NullPointerException if {@code dst} is {@code null} and
+     *         {@code ord >= 0}
+     * @throws IllegalArgumentException if {@code ord >= size()}
+     * @throws UncheckedIOException if {@code dst} throws IOException while appending
+     */
+    public int append(final int ord, final Appendable dst)
+    {
+        return dic.append(ord, dst);
+    }
 
     /**
      * Returns the stored sequence at {@code ord} as a newly allocated string.
diff --git a/util/src/java/com/github/oeuvres/alix/util/fr/FrenchCliticTokenizer.java b/util/src/java/com/github/oeuvres/alix/util/fr/FrenchCliticTokenizer.java