Skip to content

Commit 1034587

Browse files
committed
Evaluate Orama DB
1 parent a8ac573 commit 1034587

10 files changed

+1961
-186
lines changed

package-lock.json

+1,662-137
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

+6-2
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,9 @@
6767
],
6868
"dependencies": {
6969
"@nyariv/sandboxjs": "0.8.23",
70-
"@opentelemetry/core": "1.28.0",
7170
"@opentelemetry/api": "1.3.0",
7271
"@opentelemetry/context-zone": "1.8.0",
72+
"@opentelemetry/core": "1.28.0",
7373
"@opentelemetry/exporter-zipkin": "1.8.0",
7474
"@opentelemetry/instrumentation": "0.33.0",
7575
"@opentelemetry/instrumentation-fetch": "0.34.0",
@@ -79,6 +79,9 @@
7979
"@opentelemetry/sdk-trace-node": "1.8.0",
8080
"@opentelemetry/sdk-trace-web": "1.8.0",
8181
"@opentelemetry/semantic-conventions": "1.8.0",
82+
"@orama/orama": "3.1.1",
83+
"@tensorflow/tfjs-node": "4.22.0",
84+
"@tensorflow-models/universal-sentence-encoder": "1.3.3",
8285
"async": "3.2.3",
8386
"buffer": "^5.7.1",
8487
"casual": "1.6.2",
@@ -94,8 +97,8 @@
9497
"express": "4.21.1",
9598
"express-jwt": "8.2.1",
9699
"express-rate-limit": "^6.6.0",
97-
"htmlparser2": "9.0.0",
98100
"file-isignature": "1.0.3",
101+
"htmlparser2": "9.0.0",
99102
"js-yaml": "4.1.0",
100103
"jsonwebtoken": "9.0.2",
101104
"jszip": "3.10.1",
@@ -105,6 +108,7 @@
105108
"marked": "9.0.2",
106109
"mathjs": "10.5.0",
107110
"mathml-to-latex": "1.4.0",
111+
"@msgpack/msgpack": "3.1.0",
108112
"minimist": "1.2.6",
109113
"mitt": "^3.0.0",
110114
"open": "^7.4.2",

src/containers/job/JobManagerContainer.ts

+8-2
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import {FolderRegistryContainer} from '../folder_registry/FolderRegistryContaine
1818
import {ActionRunnerContainer, convertActionYaml} from '../action/ActionRunnerContainer.ts';
1919
import {getContentFileService} from '../transform/utils.ts';
2020
import {UploadContainer} from '../google_folder/UploadContainer.ts';
21+
import {createIndexer} from '../search/Indexer.ts';
2122

2223
const __filename = import.meta.filename;
2324

@@ -747,13 +748,15 @@ export class JobManagerContainer extends Container {
747748
const contentFileService = await getContentFileService(transformedFileSystem, userConfigService);
748749
const markdownTreeProcessor = new MarkdownTreeProcessor(contentFileService);
749750

751+
const indexer = await createIndexer();
752+
750753
switch (type) {
751754
case 'local':
752755
await gitScanner.resetToLocal({
753756
privateKeyFile: await userConfigService.getDeployPrivateKeyPath()
754757
});
755758

756-
await markdownTreeProcessor.regenerateTree(driveId);
759+
await markdownTreeProcessor.regenerateTree(driveId, indexer);
757760
await markdownTreeProcessor.save();
758761
break;
759762
case 'remote':
@@ -762,12 +765,15 @@ export class JobManagerContainer extends Container {
762765
privateKeyFile: await userConfigService.getDeployPrivateKeyPath()
763766
});
764767

765-
await markdownTreeProcessor.regenerateTree(driveId);
768+
await markdownTreeProcessor.regenerateTree(driveId, indexer);
766769
await markdownTreeProcessor.save();
767770
}
768771
break;
769772
}
770773

774+
await transformedFileSystem.mkdir('/.private');
775+
await transformedFileSystem.writeBuffer('/.private/' + indexer.getFileName(), await indexer.getData());
776+
771777
await this.schedule(driveId, {
772778
...initJob(),
773779
type: 'run_action',

src/containers/search/Indexer.ts

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// import {LunrIndexer} from './LunrIndexer.ts';
2+
import {OramaIndexer} from './OramaIndexer.ts';
3+
4+
/**
 * Envelope returned by `Indexer.search`.
 *
 * `result` holds one entry per hit. The entry shape is left to the
 * implementation: in this change both `OramaIndexer` and `LunrIndexer`
 * produce objects that carry at least `path` and `score`.
 */
export interface SearchResults {
5+
result: any[];
6+
}
7+
8+
/**
 * A single page handed to `Indexer.addPage`.
 *
 * `OramaIndexer` stores all four fields and builds its embedding from
 * `title` and `content`; `LunrIndexer` uses `path` as the document ref.
 */
export interface PageToIndex {
9+
content: string;
10+
id: string;
11+
title: string;
12+
path: string;
13+
}
14+
15+
/**
 * Contract shared by the search index implementations.
 *
 * - `addPage` adds one page to the index being built.
 * - `getData` returns the index serialized as bytes; `setData` loads bytes
 *   previously produced by `getData`.
 * - `search` queries the loaded index for `term`.
 * - `getFileName` is the file name under which the serialized index is
 *   written (JobManagerContainer writes it into `/.private/`).
 */
export interface Indexer {
16+
addPage(page: PageToIndex): Promise<void>;
17+
getData(): Promise<Uint8Array>;
18+
setData(data: Uint8Array): Promise<void>;
19+
search(term: string): Promise<SearchResults>;
20+
getFileName(): string;
21+
}
22+
23+
/**
 * Creates the search indexer used by the application.
 *
 * Currently delegates to `OramaIndexer.init()`. The return type is the
 * `Indexer` contract rather than the concrete class so that callers do not
 * couple themselves to the backing engine.
 *
 * @returns A promise resolving to an empty, ready-to-use indexer.
 */
export function createIndexer(): Promise<Indexer> {
  return OramaIndexer.init();
}

src/containers/search/LunrIndexer.ts

+62-7
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,34 @@
11
import lunr from 'lunr';
22
import stemmerSupport from 'lunr-languages/lunr.stemmer.support.js';
3+
import { Indexer, PageToIndex } from './Indexer.ts';
34

45
stemmerSupport(lunr);
56

67
lunr.tokenizer.separator = /[\s-_@#,.]+/;
78

8-
export class LunrIndexer {
9+
export class LunrIndexer implements Indexer {
910
private lunrBuilder: lunr.Builder;
10-
private store;
11+
private store: Record<string, any>;
12+
lunrIndex: lunr.Index;
1113

1214
constructor() {
1315
this.lunrBuilder = new lunr.Builder();
1416
this.lunrBuilder.ref('path');
15-
this.lunrBuilder.field('id2', { extractor: (doc) => {
17+
this.lunrBuilder.field('id2', {
18+
extractor: (doc) => {
1619
return doc['id'] ? doc['id'].replace(/[_-]*/g, '') : undefined;
17-
}});
20+
}
21+
});
1822
this.lunrBuilder.field('id');
1923
this.lunrBuilder.field('title');
2024
this.store = {};
2125
}
2226

23-
async addPage(page) {
27+
static async init() {
28+
return new LunrIndexer();
29+
}
30+
31+
async addPage(page: PageToIndex) {
2432
this.lunrBuilder.add({
2533
path: page.path,
2634
title: page.title,
@@ -33,9 +41,56 @@ export class LunrIndexer {
3341
};
3442
}
3543

36-
getJson() {
44+
async getData() {
3745
const lunrIndex = this.lunrBuilder.build();
38-
return { index: lunrIndex.toJSON(), store: this.store };
46+
const str = JSON.stringify({ index: lunrIndex.toJSON(), store: this.store });
47+
return new TextEncoder().encode(str);
48+
}
49+
50+
async setData(data: Uint8Array) {
51+
this.lunrIndex = undefined;
52+
this.store = {};
53+
54+
try {
55+
const lunrData = JSON.parse(new TextDecoder().decode(data));
56+
this.store = lunrData.store || {};
57+
if (lunrData?.index) {
58+
this.lunrIndex = lunr.Index.load(lunrData.index);
59+
}
60+
// deno-lint-ignore no-unused-vars
61+
} catch (err) {
62+
this.store = {};
63+
}
3964
}
4065

66+
async search(queryParam: string) {
67+
if (!this.lunrIndex) {
68+
return {
69+
result: []
70+
};
71+
}
72+
73+
queryParam = (queryParam || '').trim().replace(/:/g, ' ');
74+
75+
let result = this.lunrIndex.search(queryParam);
76+
if (result.length === 0 && queryParam.indexOf('*') === -1) {
77+
result = this.lunrIndex.search(queryParam.split(/\s+/g).map(w => w.length > 2 ? w + '*' : w).join(' '));
78+
}
79+
if (result.length === 0 && queryParam.replace(/[_-]*/g, '').length > 10) {
80+
result = this.lunrIndex.search(queryParam.replace(/[_-]*/g, ''));
81+
}
82+
83+
return {
84+
result: result.map((doc) => ({
85+
path: doc.ref,
86+
score: doc.score,
87+
matchData: doc.matchData,
88+
...this.store[doc.ref]
89+
}))
90+
};
91+
}
92+
93+
getFileName() {
94+
return 'lonr.json';
95+
}
4196
}

src/containers/search/OramaIndexer.ts

+75
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import { create, load, save, insert, search } from '@orama/orama';
2+
import type { Orama, RawData } from '@orama/orama';
3+
import msgpack from '@msgpack/msgpack';
4+
import '@tensorflow/tfjs-node'; // Or any other appropriate TensorflowJS backend, like @tensorflow/tfjs-backend-webgl
5+
6+
import { pluginEmbeddings } from './pluginEmbeddings.ts';
7+
8+
import {Indexer, SearchResults, PageToIndex} from './Indexer.ts';
9+
10+
// Embeddings plugin shared by every OramaIndexer created from this module.
// It is built with top-level await, so importing this module waits until
// pluginEmbeddings() (which loads the sentence-encoder model) has resolved.
// On insert it generates a vector from `title` and `content` and stores it
// in the `embeddings` property.
const plugin = await pluginEmbeddings({
11+
embeddings: {
12+
defaultProperty: 'embeddings',
13+
onInsert: {
14+
generate: true,
15+
properties: ['title', 'content']
16+
}
17+
}
18+
});
19+
20+
/**
 * `Indexer` implementation backed by an in-memory Orama database that is
 * searched by vector similarity. Embeddings are produced by the module-level
 * `plugin`; the database is serialized with msgpack.
 */
export class OramaIndexer implements Indexer {
  private constructor(private db: Orama<unknown, unknown, unknown, unknown>) {
  }

  /**
   * Creates an indexer with an empty database.
   *
   * @returns A new `OramaIndexer` wired to the embeddings plugin.
   */
  static async init(): Promise<OramaIndexer> {
    const db = create({
      schema: {
        title: 'string',
        id: 'string',
        path: 'string',
        embeddings: 'vector[512]',
      },
      plugins: [plugin]
    });

    return new OramaIndexer(db);
  }

  /**
   * Adds a page to the database.
   *
   * @param page Page to index; `title` and `content` feed the embedding.
   */
  async addPage(page: PageToIndex): Promise<void> {
    // The embeddings plugin registers an async `beforeInsert` hook, which makes
    // `insert` return a promise. It must be awaited so that the vector is
    // stored before `getData()` runs and so that failures reach the caller
    // instead of becoming unhandled rejections.
    await insert(this.db, {
      title: page.title,
      id: page.id,
      path: page.path,
      content: page.content
    });
  }

  /**
   * Runs a vector search for `term`.
   *
   * @param term Free-text query; the plugin turns it into a vector.
   * @returns Hits reduced to `score`, `id`, `path` and `title`.
   */
  async search(term: string): Promise<SearchResults> {
    const results = await search(this.db, {
      term,
      mode: 'vector',
      similarity: 0.5,
    });

    return {
      result: results.hits.map(h => ({
        score: h.score,
        id: h?.document?.id,
        path: h?.document?.path,
        title: h?.document?.title,
      }))
    };
  }

  /**
   * Loads a database previously produced by `getData`.
   *
   * @param data msgpack-encoded Orama raw data.
   */
  async setData(data: Uint8Array): Promise<void> {
    load(this.db, <RawData>msgpack.decode(data));
  }

  /**
   * Serializes the current database.
   *
   * @returns msgpack-encoded Orama raw data.
   */
  async getData(): Promise<Uint8Array> {
    return msgpack.encode(save(this.db));
  }

  /** File name under which the serialized database is stored. */
  getFileName(): string {
    return 'orama.msgpack';
  }
}
+89
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
import type { AnyOrama, SearchParams, TypedDocument, OramaPluginAsync, PartialSchemaDeep } from '@orama/orama'
2+
import { load as loadModel } from '@tensorflow-models/universal-sentence-encoder'
3+
4+
/**
 * Options accepted by `pluginEmbeddings`.
 *
 * - `embeddings.defaultProperty`: document property that receives the
 *   generated vector on insert, and the property searched when a query does
 *   not provide its own `vector`.
 * - `embeddings.onInsert.generate`: when falsy, nothing is generated on insert.
 * - `embeddings.onInsert.properties`: dot-separated property paths whose
 *   values are joined with '. ' to form the text that is embedded.
 * - `embeddings.onInsert.verbose`: logs the text being embedded.
 */
export type PluginEmbeddingsParams = {
5+
embeddings: {
6+
defaultProperty: string
7+
onInsert?: {
8+
generate: boolean
9+
properties: string[]
10+
verbose?: boolean
11+
}
12+
}
13+
}
14+
15+
/**
 * Resolves a dot-separated `path` against `obj`.
 *
 * @param obj Root object to read from.
 * @param path Property path such as `"a.b.c"`.
 * @returns The value found at the path, or `undefined` as soon as a segment
 *          is missing or its parent is falsy.
 */
function getPropertyValue(obj: object, path: string) {
  let cursor: unknown = obj;

  for (const segment of path.split('.')) {
    // A falsy parent (null, 0, '', ...) cannot be descended into.
    if (!cursor) {
      return undefined;
    }

    const next = (cursor as Record<string, unknown>)[segment];
    if (next === undefined) {
      return undefined;
    }

    cursor = next;
  }

  return cursor;
}
20+
21+
/**
 * Collects the values found at each of `properties` and joins them into one
 * string, separated by '. '. Paths that resolve to `undefined` are skipped.
 *
 * @param schema Object to read the properties from.
 * @param properties Dot-separated property paths.
 * @returns The joined text (empty string when nothing was found).
 */
function getPropertiesValues(schema: object, properties: string[]) {
  const found: unknown[] = [];

  for (const property of properties) {
    const value = getPropertyValue(schema, property);
    if (value !== undefined) {
      found.push(value);
    }
  }

  return found.join('. ');
}
27+
28+
/**
 * Scales `v` to unit Euclidean length.
 *
 * @param v Vector components.
 * @returns A new array with the same direction and length 1. A vector whose
 *          norm is 0 (including the empty vector) is returned as an
 *          unchanged copy, because dividing by 0 would fill it with NaN.
 */
function normalizeVector(v: number[]): number[] {
  const norm = Math.sqrt(v.reduce((sum, val) => sum + val * val, 0));

  if (norm === 0) {
    return v.slice();
  }

  return v.map(val => val / norm);
}
32+
33+
// Orama schema type string for the embeddings vector property; the same
// literal is used for `embeddings` in OramaIndexer's schema.
// NOTE(review): 512 is presumably the encoder's output dimension — confirm.
export const embeddingsType = 'vector[512]';
34+
35+
/**
 * Builds an Orama plugin that generates embedding vectors with the
 * sentence-encoder model loaded through `loadModel()`.
 *
 * - `beforeInsert`: when `embeddings.onInsert.generate` is set, embeds the
 *   text assembled from `onInsert.properties` and stores the normalized
 *   vector in `embeddings.defaultProperty` of the document being inserted.
 * - `beforeSearch`: for `vector` and `hybrid` searches that do not already
 *   carry a vector value, embeds `params.term` and fills `params.vector`.
 *
 * @param pluginParams Plugin configuration.
 * @returns The plugin object to pass to Orama's `plugins` option.
 * @throws Error from `beforeInsert` when generation is enabled without
 *         `properties`; from `beforeSearch` when neither `term` nor a vector
 *         value is provided.
 */
export async function pluginEmbeddings(pluginParams: PluginEmbeddingsParams): Promise<OramaPluginAsync> {
  const model = await loadModel();

  // Embeds `text` and returns the unit-length vector. The tensor returned by
  // the model is disposed explicitly: TensorFlow.js does not garbage-collect
  // tensor memory, so every un-disposed embedding would leak.
  async function embed(text: string): Promise<number[]> {
    const tensor = await model.embed(text);
    try {
      return normalizeVector(Array.from(await tensor.data()));
    } finally {
      tensor.dispose();
    }
  }

  return {
    name: 'orama-plugin-embeddings',

    async beforeInsert<T extends TypedDocument<any>>(_db: AnyOrama, _id: string, params: PartialSchemaDeep<T>) {
      if (!pluginParams.embeddings?.onInsert?.generate) {
        return;
      }

      if (!pluginParams.embeddings?.onInsert?.properties) {
        throw new Error('Missing "embeddings.onInsert.properties" parameter for orama-plugin-embeddings');
      }

      const properties = pluginParams.embeddings.onInsert.properties;
      const values = getPropertiesValues(params, properties);

      if (pluginParams.embeddings.onInsert.verbose) {
        console.log(`Generating embeddings for properties "${properties.join(', ')}": "${values}"`);
      }

      params[pluginParams.embeddings.defaultProperty] = await embed(values);
    },

    async beforeSearch<T extends AnyOrama>(_db: AnyOrama, params: SearchParams<T, TypedDocument<any>>) {
      // Full-text searches need no vector.
      if (params.mode !== 'vector' && params.mode !== 'hybrid') {
        return;
      }

      // The caller already supplied a vector; leave it untouched.
      if (params?.vector?.value) {
        return;
      }

      if (!params.term) {
        throw new Error('No "term" or "vector" parameters were provided');
      }

      const value = await embed(params.term);

      if (params.vector) {
        // A vector object without a value: keep its property, fill the value.
        params.vector.value = value;
      } else {
        params.vector = {
          // eslint-disable-next-line
          // @ts-ignore
          property: pluginParams.embeddings.defaultProperty,
          value
        };
      }
    }
  };
}

0 commit comments

Comments
 (0)