Skip to content

Commit a9a11e3

Browse files
author
Kalyan Kanuri
committed
fix: Address Copilot review comments
- Fix table boosting bug: results now re-sorted by boosted score - Add connect/read timeouts to model download (30s/60s) - Atomic index persistence: write to temp file, then rename - Strip <B> highlight tags from LuceneSearch results in both UIs - Hide language badge for unknown content types (return '' not 'text') - Remove unused SNIPPET_LENGTH constant - Share model directory across test methods to avoid 86MB re-download JIRA: https://issues.apache.org/jira/browse/ZEPPELIN-6411
1 parent 254f3de commit a9a11e3

4 files changed

Lines changed: 70 additions & 41 deletions

File tree

zeppelin-web-angular/src/app/pages/workspace/notebook-search/result-item/result-item.component.ts

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,11 @@ export class NotebookSearchResultItemComponent implements OnChanges {
5252
this.displayName = this.result.name ? this.result.name : `Note ${noteId}`;
5353

5454
// snippet = SQL/code, header = tables + output
55-
this.codeText = this.result.snippet || '';
55+
this.codeText = (this.result.snippet || '').replace(/<\/?B>/gi, '');
5656
this.interpreter = this.detectInterpreter(this.codeText);
5757

5858
// Parse header: lines with 📊 are tables, rest is output
59-
const header = this.result.header || '';
59+
const header = (this.result.header || '').replace(/<\/?B>/gi, '');
6060
const lines = header.split('\n');
6161
const tableParts: string[] = [];
6262
const outputParts: string[] = [];
@@ -72,12 +72,24 @@ export class NotebookSearchResultItemComponent implements OnChanges {
7272
}
7373

7474
private detectInterpreter(text: string): string {
75-
if (!text) { return ''; }
76-
if (/select|insert|create|from|where/i.test(text)) { return 'sql'; }
77-
if (/^%(\w*\.)?py/i.test(text)) { return 'python'; }
78-
if (/^%md/i.test(text)) { return 'md'; }
79-
if (/^%sh/i.test(text)) { return 'sh'; }
80-
if (/import |def |class /i.test(text)) { return 'python'; }
81-
return 'text';
75+
if (!text) {
76+
return '';
77+
}
78+
if (/select|insert|create|from|where/i.test(text)) {
79+
return 'sql';
80+
}
81+
if (/^%(\w*\.)?py/i.test(text)) {
82+
return 'python';
83+
}
84+
if (/^%md/i.test(text)) {
85+
return 'md';
86+
}
87+
if (/^%sh/i.test(text)) {
88+
return 'sh';
89+
}
90+
if (/import |def |class /i.test(text)) {
91+
return 'python';
92+
}
93+
return '';
8294
}
8395
}

zeppelin-web/src/app/search/result-list.controller.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ function SearchResultCtrl($scope, $routeParams, searchService) {
5555
let tables = '';
5656
let output = '';
5757
if (note.header) {
58-
note.header.split('\n').forEach(function(line) {
58+
note.header.replace(/<\/?B>/gi, '').split('\n').forEach(function(line) {
5959
if (line.indexOf('📊') === 0) {
6060
tables += (tables ? ', ' : '') + line.substring(2).trim();
6161
} else if (line.trim()) {

zeppelin-zengine/src/main/java/org/apache/zeppelin/search/EmbeddingSearch.java

Lines changed: 39 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@ public class EmbeddingSearch extends SearchService {
8888
private static final int MAX_RESULTS = 20;
8989
private static final float MIN_SIMILARITY = 0.25f;
9090
private static final int MAX_TEXT_LENGTH = 1500;
91-
private static final int SNIPPET_LENGTH = 150;
9291

9392
static final String ID_FIELD = "id";
9493
private static final String PARAGRAPH = "paragraph";
@@ -198,7 +197,10 @@ private void initModel() throws OrtException, IOException {
198197

199198
private static void downloadFile(String urlStr, Path dest) throws IOException {
200199
URL url = new URL(urlStr);
201-
try (InputStream in = new BufferedInputStream(url.openStream());
200+
java.net.URLConnection conn = url.openConnection();
201+
conn.setConnectTimeout(30_000);
202+
conn.setReadTimeout(60_000);
203+
try (InputStream in = new BufferedInputStream(conn.getInputStream());
202204
FileOutputStream out = new FileOutputStream(dest.toFile())) {
203205
byte[] buf = new byte[8192];
204206
int n;
@@ -447,9 +449,9 @@ public List<Map<String, String>> query(String queryStr) {
447449
});
448450
}
449451

450-
// Phase 2: re-score with table boost
451-
List<Map<String, String>> results = new ArrayList<>();
452-
for (int i = 0; i < scored.size() && results.size() < MAX_RESULTS; i++) {
452+
// Phase 2: re-score with table boost, collect candidates with boosted scores
453+
List<Map.Entry<Map<String, String>, Float>> candidates = new ArrayList<>();
454+
for (int i = 0; i < scored.size() && candidates.size() < MAX_RESULTS; i++) {
453455
float sim = scored.get(i).getValue();
454456
if (sim < MIN_SIMILARITY) {
455457
break;
@@ -459,17 +461,13 @@ public List<Map<String, String>> query(String queryStr) {
459461
if (entry == null || StringUtils.isBlank(entry.text)) {
460462
continue;
461463
}
462-
// Boost paragraphs that reference discovered tables
463464
if (!relevantTables.isEmpty() && StringUtils.isNotBlank(entry.tables)) {
464465
for (String t : entry.tables.split(" ")) {
465466
if (relevantTables.contains(t)) {
466467
sim += TABLE_BOOST;
467468
}
468469
}
469470
}
470-
// Frontend renders: header + "\n\n" + snippet in Monaco editor
471-
// snippet = SQL/code (used for language detection too)
472-
// header = title + tables + output preview
473471
StringBuilder header = new StringBuilder();
474472
if (StringUtils.isNotBlank(entry.title)) {
475473
header.append(entry.title).append("\n");
@@ -484,14 +482,19 @@ public List<Map<String, String>> query(String queryStr) {
484482
}
485483
header.append("\n").append(out);
486484
}
487-
results.add(ImmutableMap.of(
485+
candidates.add(Map.entry(ImmutableMap.of(
488486
"id", docId,
489487
"name", entry.noteName != null ? entry.noteName : "",
490488
"snippet", entry.text,
491489
"text", entry.text,
492-
"header", header.toString()));
490+
"header", header.toString()), sim));
493491
}
494492
// Re-sort by boosted score
493+
candidates.sort((a, b) -> Float.compare(b.getValue(), a.getValue()));
494+
List<Map<String, String>> results = new ArrayList<>();
495+
for (Map.Entry<Map<String, String>, Float> c : candidates) {
496+
results.add(c.getKey());
497+
}
495498
return results;
496499
}
497500

@@ -657,29 +660,34 @@ static String formatId(String noteId, Paragraph p) {
657660
*/
658661
private void saveIndex() throws IOException {
659662
Path file = indexPath.resolve("embedding_index.bin");
663+
Path tmpFile = indexPath.resolve("embedding_index.bin.tmp");
660664
indexLock.readLock().lock();
661-
try (DataOutputStream out = new DataOutputStream(new FileOutputStream(file.toFile()))) {
662-
out.writeInt(3); // version 3: includes output field
663-
out.writeInt(index.size());
664-
for (Map.Entry<String, IndexEntry> e : index.entrySet()) {
665-
out.writeUTF(e.getKey());
666-
out.writeUTF(e.getValue().noteName != null ? e.getValue().noteName : "");
667-
String text = e.getValue().text != null ? e.getValue().text : "";
668-
if (text.length() > 2000) {
669-
text = text.substring(0, 2000);
670-
}
671-
out.writeUTF(text);
672-
out.writeUTF(e.getValue().title != null ? e.getValue().title : "");
673-
out.writeUTF(e.getValue().tables != null ? e.getValue().tables : "");
674-
String output = e.getValue().output != null ? e.getValue().output : "";
675-
if (output.length() > 1000) {
676-
output = output.substring(0, 1000);
677-
}
678-
out.writeUTF(output);
679-
for (float v : e.getValue().embedding) {
680-
out.writeFloat(v);
665+
try {
666+
try (DataOutputStream out = new DataOutputStream(new FileOutputStream(tmpFile.toFile()))) {
667+
out.writeInt(3); // version 3: includes output field
668+
out.writeInt(index.size());
669+
for (Map.Entry<String, IndexEntry> e : index.entrySet()) {
670+
out.writeUTF(e.getKey());
671+
out.writeUTF(e.getValue().noteName != null ? e.getValue().noteName : "");
672+
String text = e.getValue().text != null ? e.getValue().text : "";
673+
if (text.length() > 2000) {
674+
text = text.substring(0, 2000);
675+
}
676+
out.writeUTF(text);
677+
out.writeUTF(e.getValue().title != null ? e.getValue().title : "");
678+
out.writeUTF(e.getValue().tables != null ? e.getValue().tables : "");
679+
String output = e.getValue().output != null ? e.getValue().output : "";
680+
if (output.length() > 1000) {
681+
output = output.substring(0, 1000);
682+
}
683+
out.writeUTF(output);
684+
for (float v : e.getValue().embedding) {
685+
out.writeFloat(v);
686+
}
681687
}
682688
}
689+
Files.move(tmpFile, file, java.nio.file.StandardCopyOption.REPLACE_EXISTING,
690+
java.nio.file.StandardCopyOption.ATOMIC_MOVE);
683691
} finally {
684692
indexLock.readLock().unlock();
685693
}

zeppelin-zengine/src/test/java/org/apache/zeppelin/search/EmbeddingSearchTest.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@
6464
@EnabledIfEnvironmentVariable(named = "ZEPPELIN_EMBEDDING_TEST", matches = "true")
6565
class EmbeddingSearchTest {
6666

67+
/** Shared model directory — avoids re-downloading 86MB model per test method. */
68+
private static File sharedModelDir;
69+
6770
private Notebook notebook;
6871
private InterpreterSettingManager interpreterSettingManager;
6972
private NoteManager noteManager;
@@ -72,7 +75,13 @@ class EmbeddingSearchTest {
7275

7376
@BeforeEach
7477
public void startUp() throws IOException {
78+
if (sharedModelDir == null) {
79+
sharedModelDir = Files.createTempDirectory("EmbeddingSearchTest-models").toFile();
80+
}
7581
indexDir = Files.createTempDirectory(this.getClass().getSimpleName()).toFile();
82+
// Copy shared model dir path so model is cached across tests
83+
File modelsLink = new File(indexDir, "models");
84+
Files.createSymbolicLink(modelsLink.toPath(), sharedModelDir.toPath());
7685
ZeppelinConfiguration zConf = ZeppelinConfiguration.load();
7786
zConf.setProperty(ZeppelinConfiguration.ConfVars.ZEPPELIN_SEARCH_INDEX_PATH.getVarName(),
7887
indexDir.getAbsolutePath());

0 commit comments

Comments
 (0)