Skip to content

Commit 75745d1

Browse files
committed
form() for json API
1 parent 1084790 commit 75745d1

8 files changed

Lines changed: 90 additions & 70 deletions

File tree

common/src/java/com/github/oeuvres/alix/lucene/terms/TermLexicon.java

Lines changed: 50 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
* Real term ids start at 1 and are dense and stable for the frozen snapshot from which the
4747
* lexicon was built. The id assignment follows the lexicographic iteration order returned by
4848
* Lucene's merged {@link TermsEnum} for the field. The reserved id is stored as a zero-length
49-
* phantom entry in the {@code .dat}/{@code .off} files so that {@link #term(int) term(0)}
49+
* phantom entry in the {@code .dat}/{@code .off} files so that {@link #form(int) form(0)}
5050
* returns {@code ""} and all sidecar files remain self-consistent.
5151
* </p>
5252
* <p>
@@ -107,7 +107,7 @@ public final class TermLexicon implements Closeable {
107107
private static final ThreadLocal<BytesRefBuilder> TL_TERM_BYTES =
108108
ThreadLocal.withInitial(BytesRefBuilder::new);
109109

110-
/** Per-thread reusable scratch buffer for {@link #term(int)} to avoid allocation per call. */
110+
/** Per-thread reusable scratch buffer for {@link #form(int)} to avoid allocation per call. */
111111
private static final ThreadLocal<BytesRefBuilder> TL_TERM_STRING =
112112
ThreadLocal.withInitial(BytesRefBuilder::new);
113113

@@ -254,6 +254,54 @@ public String field() {
254254
return field;
255255
}
256256

257+
/**
258+
* Returns the term string for one dense term id.
259+
* <p>
260+
* {@code form(0)} returns the empty string (reserved absent-term slot).
261+
* Uses a per-thread scratch buffer internally. Suitable for moderate use
262+
* (e.g. resolving 50 term ids for display). For tight loops over the full
263+
* vocabulary, prefer {@link #formBytes(int, BytesRefBuilder)} with a caller-owned buffer.
264+
* </p>
265+
*
266+
* @param termId dense term id in {@code [0, vocabSize)}
267+
* @return decoded UTF-8 term string, never null; empty for the reserved id 0
268+
* @throws IllegalArgumentException if {@code termId} is out of range
269+
*/
270+
public String form(final int termId) {
271+
return formBytes(termId, TL_TERM_STRING.get()).utf8ToString();
272+
}
273+
274+
/**
275+
* Copies the raw UTF-8 bytes of one term into a caller-provided reusable buffer.
276+
* <p>
277+
* This avoids allocation when called in a loop. The bytes are read directly
278+
* from the memory-mapped {@code .dat} buffer.
279+
* </p>
280+
*
281+
* @param termId dense term id in {@code [0, vocabSize)}
282+
* @param reuse destination buffer that will receive the term bytes;
283+
* grown automatically if needed
284+
* @return {@code reuse.get()} after the copy, valid until the next call on the same buffer
285+
* @throws IllegalArgumentException if {@code termId} is out of range
286+
* @throws NullPointerException if {@code reuse} is null
287+
*/
288+
public BytesRef formBytes(final int termId, final BytesRefBuilder reuse) {
289+
checkTermId(termId);
290+
Objects.requireNonNull(reuse, "reuse");
291+
292+
final int start = off.get(termId);
293+
final int end = off.get(termId + 1);
294+
final int length = end - start;
295+
296+
reuse.grow(length);
297+
final ByteBuffer dup = dat.duplicate();
298+
dup.position(start);
299+
dup.limit(end);
300+
dup.get(reuse.bytes(), 0, length);
301+
reuse.setLength(length);
302+
return reuse.get();
303+
}
304+
257305
/**
258306
* Returns the in-memory heap usage of the loaded FST, in bytes.
259307
* <p>
@@ -412,55 +460,6 @@ public static TermLexicon openOrBuild(final IndexReader reader, final Path sideD
412460
return open(sideDir, field);
413461
}
414462

415-
/**
416-
* Returns the term string for one dense term id.
417-
* <p>
418-
* {@code term(0)} returns the empty string (reserved absent-term slot).
419-
* Uses a per-thread scratch buffer internally. Suitable for moderate use
420-
* (e.g. resolving 50 term ids for display). For tight loops over the full
421-
* vocabulary, prefer {@link #termBytes(int, BytesRefBuilder)} with a caller-owned buffer.
422-
* </p>
423-
*
424-
* @param termId dense term id in {@code [0, vocabSize)}
425-
* @return decoded UTF-8 term string, never null; empty for the reserved id 0
426-
* @throws IllegalArgumentException if {@code termId} is out of range
427-
*/
428-
public String term(final int termId) {
429-
return termBytes(termId, TL_TERM_STRING.get()).utf8ToString();
430-
}
431-
432-
/**
433-
* Copies the raw UTF-8 bytes of one term into a caller-provided reusable buffer.
434-
* <p>
435-
* This avoids allocation when called in a loop. The bytes are read directly
436-
* from the memory-mapped {@code .dat} buffer.
437-
* </p>
438-
*
439-
* @param termId dense term id in {@code [0, vocabSize)}
440-
* @param reuse destination buffer that will receive the term bytes;
441-
* grown automatically if needed
442-
* @return {@code reuse.get()} after the copy, valid until the next call on the same buffer
443-
* @throws IllegalArgumentException if {@code termId} is out of range
444-
* @throws NullPointerException if {@code reuse} is null
445-
*/
446-
public BytesRef termBytes(final int termId, final BytesRefBuilder reuse) {
447-
checkTermId(termId);
448-
Objects.requireNonNull(reuse, "reuse");
449-
450-
final int start = off.get(termId);
451-
final int end = off.get(termId + 1);
452-
final int length = end - start;
453-
454-
reuse.grow(length);
455-
final ByteBuffer dup = dat.duplicate();
456-
dup.position(start);
457-
dup.limit(end);
458-
dup.get(reuse.bytes(), 0, length);
459-
reuse.setLength(length);
460-
return reuse.get();
461-
}
462-
463-
464463
/**
465464
* Returns the number of entries in the lexicon, including the reserved id 0.
466465
* <p>

common/src/java/com/github/oeuvres/alix/lucene/terms/TermSuggest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ public TermSuggest(
105105
for (int termId = 0; termId < vocabSize; termId++) {
106106
sb.append(SEP);
107107
offsets[termId] = sb.length();
108-
sb.append(Char.toAscii(lexicon.term(termId)));
108+
sb.append(Char.toAscii(lexicon.form(termId)));
109109
}
110110
sb.append(SEP);
111111
offsets[vocabSize] = sb.length();
@@ -205,7 +205,7 @@ public TopTerms suggest(
205205
int rank = 0;
206206
for (TopArray.IdScore entry : top) {
207207
final int termId = entry.id();
208-
final String term = lexicon.term(termId);
208+
final String term = lexicon.form(termId);
209209
final String termFolded = ascii.substring(
210210
offsets[termId],
211211
offsets[termId + 1] - 1

common/src/java/com/github/oeuvres/alix/lucene/terms/TopTerms.java

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -980,6 +980,26 @@ public final class TermEntry
980980
this.termId = termId;
981981
}
982982

983+
/**
984+
* Returns the current doc count for this term
985+
*
986+
* @return current count of documents with this term
987+
*/
988+
public long docs()
989+
{
990+
return termDocs[termId];
991+
}
992+
993+
/**
994+
* Returns the global doc count for this term
995+
*
996+
* @return global count of documents with this term
997+
*/
998+
public long fieldDocs()
999+
{
1000+
return fieldStats.termDocs(termId);
1001+
}
1002+
9831003
/**
9841004
* Returns the full-field occurrence count.
9851005
*
@@ -1030,9 +1050,9 @@ public double score()
10301050
*
10311051
* @return display term
10321052
*/
1033-
public String term()
1053+
public String form()
10341054
{
1035-
return lexicon.term(termId);
1055+
return lexicon.form(termId);
10361056
}
10371057

10381058
/**

test/src/main/java/com/github/oeuvres/alix/lucene/terms/CoocDemo.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ public static void main(String[] args) throws Exception {
234234
final int limit = Math.min(topN, entries.size());
235235
for (int i = 0; i < limit; i++) {
236236
final Entry e = entries.get(i);
237-
System.out.printf(" %-28s %8d %8.4f%n", lex.term(e.termId()), e.count(), e.score());
237+
System.out.printf(" %-28s %8d %8.4f%n", lex.form(e.termId()), e.count(), e.score());
238238
}
239239
if (entries.isEmpty()) {
240240
System.out.println(" (no co-occurrents above minCount=" + minCount + ")");

test/src/main/java/com/github/oeuvres/alix/lucene/terms/TermLexiconDemo.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,10 @@ public static void main(String[] args) throws Exception {
4242
return;
4343
}
4444

45-
System.out.println("term(" + termId + ") = " + lexicon.term(termId));
45+
System.out.println("term(" + termId + ") = " + lexicon.form(termId));
4646

4747

48-
final BytesRef termBytes = new BytesRef(lexicon.term(termId));
48+
final BytesRef termBytes = new BytesRef(lexicon.form(termId));
4949
final PostingsEnum postings = MultiTerms.getTermPostingsEnum(
5050
reader,
5151
field,

test/src/main/java/com/github/oeuvres/alix/lucene/terms/ThemeTermsDemo.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ public static void main(final String[] args) throws Exception {
8181
fieldStats.termWeights(luceneReader, scorer);
8282
TopTerms top = TopTerms.theme(fieldStats, lexicon, topK);
8383
for(TermEntry term: top) {
84-
System.out.print(term.term() + ", ");
84+
System.out.print(term.form() + ", ");
8585
}
8686
}
8787

web/src/main/java/com/github/oeuvres/alix/web/OpSuggest.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,12 @@ protected void json(
132132
for (TermEntry term : topTerms) {
133133
jw.beginObject();
134134
jw.name("rank").value(rank++);
135-
jw.name("term").value(term.term());
135+
jw.name("term").value(term.form());
136136
jw.name("html").value(term.hilite());
137-
jw.name("count").value(term.freq());
137+
jw.name("docs").value(term.docs());
138+
jw.name("freq").value(term.freq());
139+
jw.name("fieldDocs").value(term.fieldDocs());
140+
jw.name("fieldFreq").value(term.fieldFreq());
138141
// score has no meaning here
139142
jw.endObject();
140143
}

web/src/main/java/com/github/oeuvres/alix/web/OpTerms.java

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -139,12 +139,7 @@ else if ("part4".equals(scorerName)) {
139139
topTerms.focus(index.reader(), focusDocs);
140140
return topTerms.rank(new KeynessScorer.LogLikelihood(), topK);
141141
}
142-
else if ("rsj".equals(scorerName)) {
143-
return topTerms.focus(index.reader(), focusDocs, new IdfTermScorer.BM25(idfExp, IdfTermScorer.BM25.Mode.RSJ), topK);
144-
}
145-
else if ("irdf".equals(scorerName)) {
146-
return topTerms.focus(index.reader(), focusDocs, new IdfTermScorer.BM25(idfExp, IdfTermScorer.BM25.Mode.IRDF), topK);
147-
}
142+
148143
else if ("chi2".equals(scorerName)) {
149144
topTerms.focus(index.reader(), focusDocs);
150145
return topTerms.rank(new KeynessScorer.Chi2(), topK);
@@ -260,7 +255,7 @@ protected void html(LuceneIndex index, HttpServletRequest request, HttpServletRe
260255
for (TermEntry term : topTerms) {
261256
writer.append(" <tr>\n")
262257
.append(" <th class=\"term\">%d</th>\n".formatted(rank++))
263-
.append(" <td class=\"term\">%s</td>\n".formatted(term.term()))
258+
.append(" <td class=\"term\">%s</td>\n".formatted(term.form()))
264259
.append(" <td class=\"count\" align=\"right\">%d</td>\n".formatted(term.freq()))
265260
.append(" <td class=\"score\" align=\"right\">%f</td>\n".formatted(term.score()))
266261
.append(" </tr>\n");
@@ -302,9 +297,12 @@ protected void json(
302297
for (TermEntry term : topTerms) {
303298
jw.beginObject();
304299
jw.name("rank").value(rank++);
305-
jw.name("term").value(term.term());
306-
jw.name("count").value(term.freq());
300+
jw.name("form").value(term.form());
301+
jw.name("docs").value(term.docs());
302+
jw.name("freq").value(term.freq());
307303
jw.name("score").value(term.score());
304+
jw.name("fieldDocs").value(term.fieldDocs());
305+
jw.name("fieldFreq").value(term.fieldFreq());
308306
jw.endObject();
309307
}
310308
jw.endArray();

0 commit comments

Comments
 (0)