Skip to content

Commit ac90901

Browse files
author
Kalyan Kanuri
committed
fix: Address third-round review feedback from jongyoul
- Use INDEX_FILE_NAME constant in loadIndex() instead of the hardcoded literal (inline comment from reviewer). - Remove stale one-line Javadoc referencing v1/v2/v3 formats — only v3 is supported after the previous round dropped legacy branches. - Add TODO markers at the three locations tracked as follow-up work beyond this PR's scope: * ZEPPELIN-6412 — shard persistence per note so one paragraph edit doesn't rewrite the entire index file. * ZEPPELIN-6413 — IndexEntry in-memory duplication (rehydrate from Notebook.processNote() at query time). * ZEPPELIN-6414 — apply authorization filter before Phase-1 table collection and the top-K cutoff so inaccessible notes don't contaminate ranking. JIRA: https://issues.apache.org/jira/browse/ZEPPELIN-6411
1 parent 1db22a2 commit ac90901

1 file changed

Lines changed: 13 additions & 2 deletions

File tree

zeppelin-server/src/main/java/org/apache/zeppelin/search/EmbeddingSearch.java

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,9 @@ public class EmbeddingSearch extends SearchService {
145145
});
146146

147147
/** A single indexed document (paragraph or note name). */
148+
// TODO(ZEPPELIN-6413): Reduce in-memory duplication by keeping only {embedding, docId} here
149+
// and rehydrating text/title/output from Notebook.processNote() at query time. Needs a perf
150+
// comparison against the current in-memory path and a consistency story on LRU eviction.
148151
private static class IndexEntry {
149152
final float[] embedding;
150153
final String noteName;
@@ -480,6 +483,11 @@ private String buildParagraphText(String noteName, Paragraph p) {
480483
// ---- SearchService implementation ----
481484

482485
@Override
486+
// TODO(ZEPPELIN-6414): Accept user/roles (or a readability Predicate) and apply the auth
487+
// filter before Phase-1 table collection and before the top-K cutoff. Currently the REST
488+
// layer filters after truncation, which can hide results the caller is authorized for and
489+
// lets inaccessible notes contaminate the table-boost ranking. Requires a SearchService
490+
// interface change that also affects LuceneSearch.
483491
public List<Map<String, String>> query(String queryStr) {
484492
if (StringUtils.isBlank(queryStr) || index.isEmpty()) {
485493
return Collections.emptyList();
@@ -805,6 +813,10 @@ static String formatId(String noteId, Paragraph p) {
805813
* Format: [int:version=INDEX_VERSION][int:count] then for each entry:
806814
* [utf:docId] [utf:noteName] [utf:text] [utf:title] [utf:tables] [utf:output] [float[384]:embedding]
807815
*/
816+
// TODO(ZEPPELIN-6412): Shard persistence by note (e.g. index/notes/<noteId>.bin) so a single
817+
// paragraph edit only rewrites that note's file instead of the full index. Needs a per-note
818+
// lock strategy, a manifest for load, and a compaction path for deletes; may also revisit
819+
// append-only log + periodic compaction as the persistence model.
808820
private void saveIndex() throws IOException {
809821
Path file = indexPath.resolve(INDEX_FILE_NAME);
810822
Path tmpFile = indexPath.resolve(INDEX_FILE_NAME + ".tmp");
@@ -857,7 +869,6 @@ private void saveIndex() throws IOException {
857869
}
858870
}
859871

860-
/** Load index from disk if it exists. Supports v1/v2/v3 formats. */
861872
/**
862873
* Load the index from disk.
863874
*
@@ -866,7 +877,7 @@ private void saveIndex() throws IOException {
866877
* signalling the caller to trigger a bootstrap rebuild.
867878
*/
868879
private boolean loadIndex() {
869-
Path file = indexPath.resolve("embedding_index.bin");
880+
Path file = indexPath.resolve(INDEX_FILE_NAME);
870881
if (!Files.exists(file)) {
871882
return true;
872883
}

0 commit comments

Comments
 (0)