Skip to content

✨🌐 Added language-specific handling to search for en, fr, de #23122

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion apps/sodo-search/src/App.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ export default class App extends React.Component {
const searchIndex = new SearchIndex({
adminUrl: props.adminUrl,
apiKey: props.apiKey,
dir: dir
dir: dir,
locale: i18nLanguage
});

this.state = {
Expand Down
48 changes: 39 additions & 9 deletions apps/sodo-search/src/search-index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import Flexsearch, {Charset} from 'flexsearch';
import GhostContentAPI from '@tryghost/content-api';
import EnglishPreset from 'flexsearch/lang/en';
import FrenchPreset from 'flexsearch/lang/fr';
import GermanPreset from 'flexsearch/lang/de';

const cjkEncoderPresetCodepoint = {
finalize: (terms) => {
Expand Down Expand Up @@ -36,7 +39,7 @@ export function tokenizeCjkByCodePoint(text) {

if (isCJK(codePoint)) {
if (buffer) {
result.push(buffer); // Push any non-CJK word weve been building
result.push(buffer); // Push any non-CJK word we've been building
buffer = '';
}
result.push(char); // Push the CJK char as its own token
Expand All @@ -52,20 +55,45 @@ export function tokenizeCjkByCodePoint(text) {
return result;
}

const encoderSet = new Flexsearch.Encoder(
Charset.Default,
cjkEncoderPresetCodepoint
);
/**
 * Builds the FlexSearch encoder used by the search indexes for a given locale.
 *
 * Only the primary language subtag selects a language preset, so values such
 * as "en-US", "fr_FR" or "DE" resolve to the same preset as "en", "fr" and
 * "de". Any other value (including a missing or non-string locale) yields the
 * default encoder without a language preset. The CJK code point preset is
 * always applied last.
 *
 * @param {string} [locale] - Site locale, e.g. "en", "en-US", "fr_FR".
 * @returns {Flexsearch.Encoder} A new encoder instance for that locale.
 */
const chooseEncoder = (locale) => {
    // Strip region/script subtags and normalise case; an exact-match switch on
    // the raw value would silently drop stemming for "en-US" and similar.
    const language = typeof locale === 'string'
        ? locale.trim().split(/[-_]/)[0].toLowerCase()
        : '';

    let languagePreset = null;
    switch (language) {
    case 'en':
        languagePreset = EnglishPreset;
        break;
    case 'fr':
        languagePreset = FrenchPreset;
        break;
    case 'de':
        languagePreset = GermanPreset;
        break;
    default:
        // Unsupported or unknown language: no language-specific preset.
        break;
    }

    const encoderOptions = languagePreset
        ? [Charset.Default, languagePreset, cjkEncoderPresetCodepoint]
        : [Charset.Default, cjkEncoderPresetCodepoint];

    return new Flexsearch.Encoder(...encoderOptions);
};

export default class SearchIndex {
constructor({adminUrl, apiKey, dir}) {
constructor({adminUrl, apiKey, dir, locale}) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Verification agent

🧩 Analysis chain

Verify the locale value format

Ensure that the locale value passed from App.js is compatible with the switch statement in chooseEncoder. Currently, it expects two-letter language codes ('en', 'fr', 'de'), but browser-provided locale strings might include region codes (e.g., 'en-US', 'fr-FR').


🏁 Script executed:

#!/bin/bash
# Check for any locale handling code that might normalize locale values
rg -A 2 -B 2 "locale.*=|=.*locale" --glob "*.js" --glob "*.ts"

Length of output: 45616


I’d like to inspect the chooseEncoder implementation and constructor in apps/sodo-search/src/search-index.js to confirm there’s no locale normalization before the switch:


🏁 Script executed:

#!/bin/bash
# Show the chooseEncoder function and constructor in search-index.js
rg -n "chooseEncoder" -A10 -B2 apps/sodo-search/src/search-index.js

Length of output: 1427


Normalize locale to two-letter codes before choosing an encoder

The locale passed into chooseEncoder comes directly from the script tag (e.g. "en-US") but the switch only matches exact two-letter codes ('en', 'fr', etc.). You’ll need to strip off any region subtags (and lowercase) so you always pass a supported code:

• File: apps/sodo-search/src/search-index.js
– Constructor (lines ~98–102) currently does:
  `this.postsIndex = new Flexsearch.Document({ …, encoder: chooseEncoder(locale) });`
– `chooseEncoder` (lines 57–67) only handles exact matches like 'en' and 'fr'.

Suggested change:

   constructor({adminUrl, apiKey, dir, locale}) {
-    const encoder = chooseEncoder(locale);
+    const lang = locale.split(/[-_]/)[0].toLowerCase();
+    const encoder = chooseEncoder(lang);

    this.postsIndex = new Flexsearch.Document({
      …,
      encoder
    });
    // repeat for authorsIndex, tagsIndex

This ensures "en-US", "fr-FR", etc. fall back to your two-letter presets.

this.api = new GhostContentAPI({
url: adminUrl,
key: apiKey,
version: 'v5.0'
});
const rtl = (dir === 'rtl');
const tokenize = (dir === 'rtl') ? 'reverse' : 'forward';
const encoder = chooseEncoder(locale);

this.postsIndex = new Flexsearch.Document({
tokenize: tokenize,
rtl: rtl,
Expand All @@ -74,8 +102,9 @@ export default class SearchIndex {
index: ['title', 'excerpt'],
store: true
},
encoder: encoderSet
encoder: encoder
});

this.authorsIndex = new Flexsearch.Document({
tokenize: tokenize,
rtl: rtl,
Expand All @@ -84,8 +113,9 @@ export default class SearchIndex {
index: ['name'],
store: true
},
encoder: encoderSet
encoder: encoder
});

this.tagsIndex = new Flexsearch.Document({
tokenize: tokenize,
rtl: rtl,
Expand All @@ -94,7 +124,7 @@ export default class SearchIndex {
index: ['name'],
store: true
},
encoder: encoderSet
encoder: encoder
});

this.init = this.init.bind(this);
Expand Down
156 changes: 156 additions & 0 deletions apps/sodo-search/src/search-index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -429,4 +429,160 @@ describe('search index', function () {
searchResults = searchIndex.search('Baklava');
expect(searchResults.posts.length).toEqual(0); // because search isn't magic
});

// These tests illustrate differences in stemming between languages en and de.

// Builds a SearchIndex with locale 'en' over two German-titled and two
// English-titled posts, then checks how many posts each query returns.
test('stemming is language-specific - english', async () => {
const adminUrl = 'http://localhost:3000';
const apiKey = '69010382388f9de5869ad6e558';
const searchIndex = new SearchIndex({adminUrl, apiKey, storage: localStorage, locale: 'en'});

// Stub the Content API posts endpoint with the four fixture posts.
nock('http://localhost:3000/ghost/api/content')
.get('/posts/?key=69010382388f9de5869ad6e558&limit=10000&fields=id%2Cslug%2Ctitle%2Cexcerpt%2Curl%2Cupdated_at%2Cvisibility&order=updated_at%20DESC')
.reply(200, {
posts: [{
id: 'post',
title: 'des Mannes',
url: 'http://localhost/ghost/des-mannes/'
}, {
id: 'post2',
title: 'dem Mann',
url: 'http://localhost/ghost/dem-mann/'
}, {
id: 'post4',
title: 'Running',
url: 'http://localhost/ghost/running/'
}, {
id: 'post5',
title: 'Run',
url: 'http://localhost/ghost/run/'
}]
});

// Authors and tags endpoints are stubbed with empty lists.
nock('http://localhost:3000/ghost/api/content')
.get('/authors/?key=69010382388f9de5869ad6e558&limit=10000&fields=id%2Cslug%2Cname%2Curl%2Cprofile_image&order=updated_at%20DESC')
.reply(200, {authors: []});

nock('http://localhost:3000/ghost/api/content')
.get('/tags/?key=69010382388f9de5869ad6e558&limit=10000&fields=id%2Cslug%2Cname%2Curl&order=updated_at%20DESC&filter=visibility%3Apublic')
.reply(200, {tags: []});

await searchIndex.init();

// With the English locale both 'running' and 'run' are expected to return
// the 'Running' and 'Run' posts (presumably via English stemming — confirm
// against the FlexSearch English preset).
let searchResults = searchIndex.search('running');
expect(searchResults.posts.length).toEqual(2);

searchResults = searchIndex.search('run');
expect(searchResults.posts.length).toEqual(2);

// The German titles are expected to match only their own post here, i.e.
// 'Mannes' and 'Mann' are not treated as the same word under 'en'.
searchResults = searchIndex.search('des mannes');
expect(searchResults.posts.length).toEqual(1);
expect(searchResults.posts[0].url).toEqual('http://localhost/ghost/des-mannes/');

searchResults = searchIndex.search('dem mann');
expect(searchResults.posts.length).toEqual(1);
expect(searchResults.posts[0].url).toEqual('http://localhost/ghost/dem-mann/');
});
// Same fixture posts as the English stemming test, but the index is built
// with locale 'de'; the expected result counts differ accordingly.
test('stemming is language-specific - german', async () => {
const adminUrl = 'http://localhost:3000';
const apiKey = '69010382388f9de5869ad6e558';
const searchIndex = new SearchIndex({adminUrl, apiKey, storage: localStorage, locale: 'de'});

// Stub the Content API posts endpoint with the four fixture posts.
nock('http://localhost:3000/ghost/api/content')
.get('/posts/?key=69010382388f9de5869ad6e558&limit=10000&fields=id%2Cslug%2Ctitle%2Cexcerpt%2Curl%2Cupdated_at%2Cvisibility&order=updated_at%20DESC')
.reply(200, {
posts: [{
id: 'post',
title: 'des Mannes',
url: 'http://localhost/ghost/des-mannes/'
}, {
id: 'post2',
title: 'dem Mann',
url: 'http://localhost/ghost/dem-mann/'
},
{
id: 'post4',
title: 'Running',
url: 'http://localhost/ghost/running/'
}, {
id: 'post5',
title: 'Run',
url: 'http://localhost/ghost/run/'
}]
});

// Authors and tags endpoints are stubbed with empty lists.
nock('http://localhost:3000/ghost/api/content')
.get('/authors/?key=69010382388f9de5869ad6e558&limit=10000&fields=id%2Cslug%2Cname%2Curl%2Cprofile_image&order=updated_at%20DESC')
.reply(200, {authors: []});

nock('http://localhost:3000/ghost/api/content')
.get('/tags/?key=69010382388f9de5869ad6e558&limit=10000&fields=id%2Cslug%2Cname%2Curl&order=updated_at%20DESC&filter=visibility%3Apublic')
.reply(200, {tags: []});

await searchIndex.init();

// Under 'de', 'running' is expected to return only the 'Running' post, so
// the English-style reduction of 'running' to 'run' is not applied.
let searchResults = searchIndex.search('running');
expect(searchResults.posts.length).toEqual(1);
expect(searchResults.posts[0].url).toEqual('http://localhost/ghost/running/');

// 'run' is still expected to return both posts.
// NOTE(review): presumably through prefix matching rather than stemming — confirm.
searchResults = searchIndex.search('run');
expect(searchResults.posts.length).toEqual(2);

// Both German queries are expected to return both German-titled posts
// (presumably because the German preset normalises articles and
// inflections — confirm against the FlexSearch German preset).
searchResults = searchIndex.search('des mannes');
expect(searchResults.posts.length).toEqual(2);

searchResults = searchIndex.search('dem mann');
expect(searchResults.posts.length).toEqual(2);
});
// Same fixture posts as the English and German stemming tests, but with the
// locale 'vt', which the test title describes as unsupported.
test('no language-specific stemming with an unsupported locale', async () => {
const adminUrl = 'http://localhost:3000';
const apiKey = '69010382388f9de5869ad6e558';
const searchIndex = new SearchIndex({adminUrl, apiKey, storage: localStorage, locale: 'vt'});

// Stub the Content API posts endpoint with the four fixture posts.
nock('http://localhost:3000/ghost/api/content')
.get('/posts/?key=69010382388f9de5869ad6e558&limit=10000&fields=id%2Cslug%2Ctitle%2Cexcerpt%2Curl%2Cupdated_at%2Cvisibility&order=updated_at%20DESC')
.reply(200, {
posts: [{
id: 'post',
title: 'des Mannes',
url: 'http://localhost/ghost/des-mannes/'
}, {
id: 'post2',
title: 'dem Mann',
url: 'http://localhost/ghost/dem-mann/'
},
{
id: 'post4',
title: 'Running',
url: 'http://localhost/ghost/running/'
}, {
id: 'post5',
title: 'Run',
url: 'http://localhost/ghost/run/'
}]
});

// Authors and tags endpoints are stubbed with empty lists.
nock('http://localhost:3000/ghost/api/content')
.get('/authors/?key=69010382388f9de5869ad6e558&limit=10000&fields=id%2Cslug%2Cname%2Curl%2Cprofile_image&order=updated_at%20DESC')
.reply(200, {authors: []});

nock('http://localhost:3000/ghost/api/content')
.get('/tags/?key=69010382388f9de5869ad6e558&limit=10000&fields=id%2Cslug%2Cname%2Curl&order=updated_at%20DESC&filter=visibility%3Apublic')
.reply(200, {tags: []});

await searchIndex.init();

// Without a language preset, 'running' is expected to return only the
// 'Running' post.
let searchResults = searchIndex.search('running');
expect(searchResults.posts.length).toEqual(1);
expect(searchResults.posts[0].url).toEqual('http://localhost/ghost/running/');

// 'run' is still expected to return both posts.
// NOTE(review): presumably through prefix matching rather than stemming — confirm.
searchResults = searchIndex.search('run');
expect(searchResults.posts.length).toEqual(2);

// Each German query is expected to return a single post, unlike the 'de'
// locale test where both German-titled posts are returned.
searchResults = searchIndex.search('des mannes');
expect(searchResults.posts.length).toEqual(1);

searchResults = searchIndex.search('dem mann');
expect(searchResults.posts.length).toEqual(1);
});
});
1 change: 1 addition & 0 deletions apps/sodo-search/vite.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ export default defineConfig((config) => {
exclude: []
},
optimizeDeps: {
include: ['flexsearch', 'flexsearch/lang/en', 'flexsearch/lang/fr', 'flexsearch/lang/de'],
esbuildOptions: {
plugins: [
{
Expand Down