Skip to content

Commit 16674c6

Browse files
committed
Add support for integrated suggest-as-you-type suggestions.
This patch aims to remove the govarnam dependency for search suggestions. - Introduces a new trie-based in-memory word search mechanism which is populated using all words in the DB during load. Configurable with the `[suggestions]` config block. This works for all languages that are entered in the input box in their native Unicode forms. If fewer suggestions than the configured number are found, it supplements them with fulltext search on the DB. - For languages (eg: Malayalam, Kannada) where the input can be Latin characters denoting phonetic strings (eg: "amma"), search + suggestions still work thanks to the new transliteration algorithm shipped in the bundled Lua tokenizers. This works for any language, as long as tokenize() in the Lua tokenizers returns tokens, whatever the input may be. So, this patch introduces transliteration mapping for generating IndicPhone hashes for "Manglish" and "Kanglish" Latin-character phonetic inputs.
1 parent 60d7258 commit 16674c6

File tree

16 files changed

+711
-13
lines changed

16 files changed

+711
-13
lines changed

Cargo.lock

Lines changed: 48 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ bytes = "1"
4040
bincode = "1"
4141
rmp-serde = "1"
4242
md5 = "0.8"
43+
trie-rs = "0.4.2"
4344

4445
# Use mimalloc for musl builds (musl's default malloc is very slow).
4546
[target.'cfg(target_env = "musl")'.dependencies]

config.sample.toml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,15 @@ max_disk_mb = 512
6565
dir = "/tmp/dictpress-cache"
6666

6767

68+
[suggestions]
69+
# Enable autocomplete suggestions for search queries on the frontend.
70+
# This loads all dictionary words into memory for fast lookup.
71+
enabled = false
72+
73+
# Number of suggestions to return.
74+
num = 10
75+
76+
6877
################################################################################
6978

7079

site/static/main.js

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -265,14 +265,13 @@ async function screenshotElement(element) {
265265
clearTimeout(debounce);
266266
return new Promise(resolve => {
267267
debounce = setTimeout(async () => {
268-
const response = await fetch(`${_ROOT_URL}/atl/${langCode}/${val.toLowerCase()}`);
269-
const data = await response.json();
268+
const response = await fetch(`${_ROOT_URL}/api/suggestions/${langCode}/${val}`);
269+
const data = await response.json();
270270

271-
const a = data.greedy_tokenized.map(item => item.word).slice(0, 3).sort((a, b) => a.length - b.length);
272-
const b = data.dictionary_suggestions.map(item => item.word).slice(0, 6).sort((a, b) => a.length - b.length);
271+
const suggestions = data.data.map(item => item.content[0]);
273272

274-
debounce = null;
275-
resolve([...new Set(a.concat(b))]);
273+
debounce = null;
274+
resolve(suggestions);
276275
}, 50);
277276
});
278277
},

src/handlers/mod.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ use crate::{
2020
cache::Cache,
2121
manager::Manager,
2222
models::{Dicts, LangMap},
23+
suggestions::Suggestions,
2324
};
2425

2526
pub type I18n = HashMap<String, String>;
@@ -30,6 +31,7 @@ pub struct Ctx {
3031
pub langs: LangMap,
3132
pub dicts: Dicts,
3233
pub cache: Option<Arc<Cache>>,
34+
pub suggestions: Option<Arc<Suggestions>>,
3335

3436
/// Admin templates (always loaded, embedded in binary).
3537
pub admin_tpl: Arc<Tera>,
@@ -72,6 +74,10 @@ pub struct Consts {
7274
pub glossary_max_per_page: i32,
7375
pub glossary_num_page_nums: i32,
7476

77+
// Suggestions settings.
78+
pub suggestions_enabled: bool,
79+
pub num_suggestions: i32,
80+
7581
// Admin assets split by type for easier template rendering.
7682
pub admin_js_assets: Vec<String>,
7783
pub admin_css_assets: Vec<String>,
@@ -100,6 +106,9 @@ impl Default for Consts {
100106
glossary_max_per_page: 100,
101107
glossary_num_page_nums: 10,
102108

109+
suggestions_enabled: false,
110+
num_suggestions: 10,
111+
103112
admin_js_assets: Vec::new(),
104113
admin_css_assets: Vec::new(),
105114
}

src/handlers/search.rs

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@ use axum::{
77

88
use super::{json, paginate, total_pages, ApiErr, ApiResp, Ctx, Result};
99
use crate::cache::make_search_cache_key;
10-
use crate::models::{RelationsQuery, SearchQuery, SearchResults, STATUS_ENABLED};
10+
use crate::models::{
11+
RelationsQuery, SearchQuery, SearchResults, StringArray, Suggestion, STATUS_ENABLED,
12+
};
1113

1214
/// Search a dictionary with query in path (public API).
1315
pub async fn search(
@@ -158,3 +160,52 @@ pub async fn do_search(ctx: Arc<Ctx>, query: SearchQuery, is_admin: bool) -> Res
158160

159161
Ok(results)
160162
}
163+
164+
/// Suggestions endpoint for search word autocomplete.
165+
pub async fn get_suggestions(
166+
State(ctx): State<Arc<Ctx>>,
167+
Path((lang, q)): Path<(String, String)>,
168+
) -> Result<ApiResp<Vec<Suggestion>>> {
169+
if q.is_empty() {
170+
return Err(ApiErr::new("`q` is required", StatusCode::BAD_REQUEST));
171+
}
172+
if !ctx.langs.contains_key(&lang) {
173+
return Err(ApiErr::new("unknown language", StatusCode::BAD_REQUEST));
174+
}
175+
176+
// If suggestions are disabled, return an empty array.
177+
if !ctx.consts.suggestions_enabled {
178+
return Ok(json(Vec::new()));
179+
}
180+
181+
let limit = ctx.consts.num_suggestions;
182+
183+
// Try trie search first if suggestions are enabled.
184+
let mut out: Vec<Suggestion> = if let Some(sugg) = &ctx.suggestions {
185+
sugg.query(&lang, &q, limit as usize)
186+
.into_iter()
187+
.map(|w| Suggestion {
188+
content: StringArray(vec![w]),
189+
})
190+
.collect()
191+
} else {
192+
Vec::new()
193+
};
194+
195+
// If there are fewer than limit results, supplement with DB FTS search.
196+
if out.len() < limit as usize {
197+
let remaining = limit - out.len() as i32;
198+
if let Ok(res) = ctx.mgr.get_suggestions(&lang, &q, remaining).await {
199+
for s in res {
200+
if !out.iter().any(|r| r.content.0 == s.content.0) {
201+
out.push(s);
202+
if out.len() >= limit as usize {
203+
break;
204+
}
205+
}
206+
}
207+
}
208+
}
209+
210+
Ok(json(out))
211+
}

src/http.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ pub fn init_handlers(ctx: Arc<Ctx>) -> Router {
3535
.route(
3636
"/api/dictionary/entries/{guid}",
3737
get(entries::get_entry_by_guid),
38-
);
38+
)
39+
.route("/api/suggestions/{lang}/{q}", get(search::get_suggestions));
3940

4041
// Public submission routes (if enabled).
4142
let submit_routes = Router::new()

src/init.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
use crate::cache::{Cache, CacheConfig, CacheError};
2+
use crate::manager::Manager;
23
use crate::models::{Config, Dicts, Lang, LangMap, DEFAULT_TOKENIZER};
4+
use crate::suggestions::Suggestions;
35
use crate::tokenizer::Tokenizers;
46

57
/// Initialize logger.
@@ -182,3 +184,20 @@ pub async fn cache(cfg: &CacheConfig) -> Result<Cache, CacheError> {
182184
);
183185
Cache::new(cfg).await
184186
}
187+
188+
/// Initialize suggestions trie from database.
189+
pub async fn suggestions(
190+
mgr: &Manager,
191+
langs: &LangMap,
192+
) -> Result<Suggestions, Box<dyn std::error::Error>> {
193+
let mut sugg = Suggestions::new();
194+
195+
for lang_id in langs.keys() {
196+
let words = mgr.get_all_words(lang_id).await?;
197+
let num = words.len();
198+
sugg.build(lang_id, words);
199+
log::info!("suggestions: loaded {} words for {}", num, lang_id);
200+
}
201+
202+
Ok(sugg)
203+
}

src/main.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ mod init;
99
mod manager;
1010
mod models;
1111
mod sitemaps;
12+
mod suggestions;
1213
mod tokenizer;
1314

1415
// Use mimalloc for musl builds (musl's default malloc is very slow).
@@ -226,6 +227,19 @@ async fn main() {
226227
None
227228
};
228229

230+
// Initialize trie suggestions if enabled.
231+
let suggestions = if config.suggestions.enabled {
232+
match init::suggestions(&mgr, &langs).await {
233+
Ok(s) => Some(Arc::new(s)),
234+
Err(e) => {
235+
log::error!("error initializing suggestions module: {}", e);
236+
None
237+
}
238+
}
239+
} else {
240+
None
241+
};
242+
229243
// Preload static files (JS & CSS) for bundling.
230244
let static_files = http::preload_static_files(&cli.site);
231245

@@ -235,6 +249,7 @@ async fn main() {
235249
langs,
236250
dicts,
237251
cache,
252+
suggestions,
238253
admin_tpl,
239254
site_tpl,
240255
site_path: cli.site.clone(),
@@ -263,6 +278,9 @@ async fn main() {
263278
glossary_max_per_page: config.glossary.max_per_page,
264279
glossary_num_page_nums: config.glossary.num_page_nums,
265280

281+
suggestions_enabled: config.suggestions.enabled,
282+
num_suggestions: config.suggestions.num,
283+
266284
// Split admin assets by file extension for template rendering.
267285
admin_js_assets: config
268286
.app

src/manager/mod.rs

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use sqlx::{sqlite::SqlitePool, Row};
55
use crate::{
66
models::{
77
q, Comment, Dicts, Entry, GlossaryWord, LangMap, Relation, RelationsQuery, SearchQuery,
8-
Stats, STATUS_ENABLED,
8+
Stats, Suggestion, STATUS_ENABLED,
99
},
1010
tokenizer::{Tokenizer, TokenizerError, Tokenizers},
1111
};
@@ -116,6 +116,33 @@ impl Manager {
116116
Ok((results, total))
117117
}
118118

119+
/// Get word suggestions for autocomplete.
120+
pub async fn get_suggestions(
121+
&self,
122+
lang: &str,
123+
query: &str,
124+
limit: i32,
125+
) -> Result<Vec<Suggestion>, Error> {
126+
if !self.langs.contains_key(lang) {
127+
return Err(Error::UnknownLang(lang.to_string()));
128+
}
129+
130+
let fts_query = self.to_fts_query(query, lang)?;
131+
if fts_query.trim().is_empty() {
132+
return Err(Error::Validation("invalid search query".to_string()));
133+
}
134+
135+
let results: Vec<Suggestion> = sqlx::query_as(&q.search_words.query)
136+
.bind(lang)
137+
.bind(query)
138+
.bind(&fts_query)
139+
.bind(limit)
140+
.fetch_all(&self.db)
141+
.await?;
142+
143+
Ok(results)
144+
}
145+
119146
/// Load relations for a set of entries.
120147
/// rel_query.max_per_type: 0 = load all relations, >0 = limit relations per type per entry.
121148
/// rel_query.max_content_items: 0 = no truncation, >0 = truncate content array.
@@ -582,4 +609,19 @@ impl Manager {
582609
let stats: Stats = serde_json::from_str(&row.0).unwrap_or_default();
583610
Ok(stats)
584611
}
612+
613+
/// Get all normalized words for a given language (for building suggestions trie).
614+
pub async fn get_all_words(&self, lang: &str) -> Result<Vec<String>, Error> {
615+
let rows: Vec<(String,)> = sqlx::query_as(&q.get_all_words.query)
616+
.bind(lang)
617+
.fetch_all(&self.db)
618+
.await?;
619+
620+
// Filter out empty strings and collect
621+
Ok(rows
622+
.into_iter()
623+
.map(|(s,)| s.trim().to_string())
624+
.filter(|s| !s.is_empty())
625+
.collect())
626+
}
585627
}

0 commit comments

Comments
 (0)