Skip to content

Commit 1e577d6

Browse files
wow-miley and claude authored
AMPR-153 #462: add on-device knowledge embeddings schema (#471)
Land the W0.5 foundation for on-device knowledge: SQLDelight tables for documents, chunks, and embedding vectors (BLOB-encoded), an FTS5 virtual table for keyword fallback, the pure-Kotlin EmbeddingVector value type (cosine/dot/serialization), and the commonMain KnowledgeStore interface that platform pipelines (W1.8 iOS, W1.9 Android) will implement. Closes #462 Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 7eb624f commit 1e577d6

7 files changed

Lines changed: 897 additions & 0 deletions

File tree

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
package link.socket.ampere.knowledge
2+
3+
import kotlin.math.sqrt
4+
5+
/**
 * A dense vector representation of a document chunk produced by an embedding model.
 *
 * Offers the similarity primitives ([cosineSimilarity], [dotProduct]) used by the
 * on-device knowledge retrieval pipeline, plus a stable byte serialization
 * ([toBlob] / [fromBlob]) for SQLDelight BLOB persistence.
 *
 * Equality and hashing are structural, based on the contents of [values]. The array is
 * held by reference (no defensive copy is made), so callers must not mutate it after
 * construction: doing so would change the [equals] / [hashCode] results of a live instance.
 *
 * @property values Raw components of the vector; must contain at least one element.
 * @throws IllegalArgumentException if [values] is empty.
 */
class EmbeddingVector(
    val values: FloatArray,
) {
    init {
        require(values.isNotEmpty()) { "EmbeddingVector must have at least one dimension" }
    }

    /** Number of dimensions in the vector. */
    val dimension: Int get() = values.size

    /**
     * Sum of element-wise products with [other], accumulated in single precision.
     *
     * @throws IllegalArgumentException if [dimension] does not match.
     */
    fun dotProduct(other: EmbeddingVector): Float {
        requireSameDimension(other)
        var sum = 0f
        val a = values
        val b = other.values
        for (i in a.indices) {
            sum += a[i] * b[i]
        }
        return sum
    }

    /**
     * Cosine similarity in `[-1, 1]`. Returns `0` when either vector has zero magnitude.
     *
     * The dot product and squared magnitudes are accumulated in [Double]: squaring a
     * [Float] component in single precision overflows to infinity for large values
     * (yielding `NaN`) and underflows to zero for small ones (making a non-zero vector
     * look like a zero vector). The quotient is clamped because rounding can otherwise
     * push it marginally outside `[-1, 1]`.
     *
     * The result is `NaN` if any component of either vector is `NaN` or infinite.
     *
     * @throws IllegalArgumentException if [dimension] does not match.
     */
    fun cosineSimilarity(other: EmbeddingVector): Float {
        requireSameDimension(other)
        var dot = 0.0
        var squaredMagA = 0.0
        var squaredMagB = 0.0
        val a = values
        val b = other.values
        for (i in a.indices) {
            val x = a[i].toDouble()
            val y = b[i].toDouble()
            dot += x * y
            squaredMagA += x * x
            squaredMagB += y * y
        }
        if (squaredMagA == 0.0 || squaredMagB == 0.0) return 0f
        val similarity = dot / (sqrt(squaredMagA) * sqrt(squaredMagB))
        // coerceIn leaves NaN untouched, so invalid inputs still surface as NaN.
        return similarity.coerceIn(-1.0, 1.0).toFloat()
    }

    /**
     * Serialize to a fixed-size big-endian byte array (`dimension * 4` bytes) for BLOB storage.
     *
     * Format: each [Float] is encoded via [Float.toRawBits] in big-endian order so the
     * payload is portable across all Kotlin Multiplatform targets.
     */
    fun toBlob(): ByteArray {
        val bytes = ByteArray(values.size * BYTES_PER_FLOAT)
        for (i in values.indices) {
            val bits = values[i].toRawBits()
            val offset = i * BYTES_PER_FLOAT
            // Most significant byte first; toByte() keeps only the low 8 bits.
            bytes[offset] = (bits ushr 24).toByte()
            bytes[offset + 1] = (bits ushr 16).toByte()
            bytes[offset + 2] = (bits ushr 8).toByte()
            bytes[offset + 3] = bits.toByte()
        }
        return bytes
    }

    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is EmbeddingVector) return false
        return values.contentEquals(other.values)
    }

    override fun hashCode(): Int = values.contentHashCode()

    /** Reports only the dimension; the component values are deliberately not printed. */
    override fun toString(): String = "EmbeddingVector(dimension=$dimension)"

    /** Shared precondition for the binary vector operations. */
    private fun requireSameDimension(other: EmbeddingVector) {
        require(dimension == other.dimension) {
            "Dimension mismatch: $dimension vs ${other.dimension}"
        }
    }

    companion object {
        private const val BYTES_PER_FLOAT = 4

        /**
         * Decode a vector previously written by [toBlob].
         *
         * @throws IllegalArgumentException if [bytes] length is not a positive multiple of 4.
         */
        fun fromBlob(bytes: ByteArray): EmbeddingVector {
            require(bytes.isNotEmpty() && bytes.size % BYTES_PER_FLOAT == 0) {
                "Embedding blob must be a positive multiple of $BYTES_PER_FLOAT bytes, got ${bytes.size}"
            }
            val floats = FloatArray(bytes.size / BYTES_PER_FLOAT)
            for (i in floats.indices) {
                val offset = i * BYTES_PER_FLOAT
                // Mask each byte to undo sign extension before shifting it into place.
                val bits = ((bytes[offset].toInt() and 0xff) shl 24) or
                    ((bytes[offset + 1].toInt() and 0xff) shl 16) or
                    ((bytes[offset + 2].toInt() and 0xff) shl 8) or
                    (bytes[offset + 3].toInt() and 0xff)
                floats[i] = Float.fromBits(bits)
            }
            return EmbeddingVector(floats)
        }
    }
}
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
package link.socket.ampere.knowledge
2+
3+
import kotlinx.datetime.Instant
4+
5+
/**
 * Contract for the on-device knowledge primitive: it stores documents, chunks them,
 * embeds the chunks, and answers retrieval queries over the local index.
 *
 * This commonMain surface is what the platform-specific pipelines (W1.8 iOS,
 * W1.9 Android) implement.
 *
 * All work runs on-device. No cloud embedding APIs are invoked from this layer.
 */
interface KnowledgeStore {

    /**
     * Persist a new document, or update an existing one keyed by content hash.
     *
     * Implementations should be idempotent: re-importing a document whose
     * [KnowledgeDocument.contentHash] is already present must not produce duplicate
     * chunks or embeddings.
     *
     * @param document Document to store.
     * @return The stored document, wrapped in a [Result].
     */
    suspend fun addDocument(document: KnowledgeDocument): Result<KnowledgeDocument>

    /**
     * Split the content of [documentId] into chunks and produce an embedding for each
     * chunk with the on-device embedding model identified by [modelId].
     *
     * @param documentId Document previously persisted via [addDocument].
     * @param modelId Identifier of the embedding model (e.g. `"all-MiniLM-L6-v2"`).
     * @return The chunks that were embedded, in order.
     */
    suspend fun chunkAndEmbed(
        documentId: String,
        modelId: String,
    ): Result<List<KnowledgeChunk>>

    /**
     * Retrieve the chunks relevant to [text] using the supplied [mode].
     *
     * @param text Free-form query.
     * @param limit Maximum number of results to return; defaults to [DEFAULT_QUERY_LIMIT].
     * @param mode Retrieval strategy. Defaults to [QueryMode.HYBRID].
     * @return The matching chunks with their scores, wrapped in a [Result].
     */
    suspend fun query(
        text: String,
        limit: Int = DEFAULT_QUERY_LIMIT,
        mode: QueryMode = QueryMode.HYBRID,
    ): Result<List<KnowledgeQueryResult>>

    companion object {
        /** Result cap used by [query] when the caller does not pass `limit`. */
        const val DEFAULT_QUERY_LIMIT: Int = 10
    }
}
55+
56+
/**
 * Retrieval strategy selected by a caller of [KnowledgeStore.query].
 */
enum class QueryMode {
    /** Search by cosine similarity over [EmbeddingVector]s. */
    SEMANTIC,

    /** Search chunk text by keyword through FTS5. */
    KEYWORD,

    /** Run both [SEMANTIC] and [KEYWORD] and blend their scores. */
    HYBRID,
}
69+
70+
/**
 * A document that has been imported into the on-device knowledge store.
 *
 * @property id Stable identifier.
 * @property title Human-readable title.
 * @property sourceUri Optional pointer to the original source (file path, URL, etc.).
 * @property importedAt When the document was imported.
 * @property contentHash Hash of the source content; used for idempotent re-import.
 * @property content Raw textual content. Defaults to the empty string, and may be left
 *   empty when the implementation lazily loads content from [sourceUri].
 */
data class KnowledgeDocument(
    val id: String,
    val title: String,
    val sourceUri: String?,
    val importedAt: Instant,
    val contentHash: String,
    val content: String = "",
)
89+
90+
/**
 * One chunk of a [KnowledgeDocument]'s text. Chunks are the unit at which embeddings
 * are produced and at which retrieval results are returned.
 *
 * @property id Identifier of the chunk.
 * @property documentId Identifier of the owning [KnowledgeDocument].
 * @property chunkIndex Position of the chunk within its document.
 * @property text Text covered by the chunk.
 * @property charStart Start offset of the chunk; presumably a character offset into the
 *   document content (TODO confirm against the chunker).
 * @property charEnd End offset of the chunk; whether it is inclusive or exclusive is not
 *   specified here (TODO confirm against the chunker).
 */
data class KnowledgeChunk(
    val id: String,
    val documentId: String,
    val chunkIndex: Int,
    val text: String,
    val charStart: Int,
    val charEnd: Int,
)
102+
103+
/**
 * One retrieval hit produced by [KnowledgeStore.query].
 *
 * @property chunk The chunk that matched the query.
 * @property score Mode-dependent score (cosine similarity, FTS rank, or hybrid blend).
 *   A higher score means a better match.
 */
data class KnowledgeQueryResult(
    val chunk: KnowledgeChunk,
    val score: Float,
)
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
-- On-device knowledge schema: documents, chunks, and embedding vectors.
2+
-- Vectors are stored as raw BLOBs serialized via EmbeddingVector.toBlob().
3+
-- Platform retrieval pipelines (W1.8 iOS, W1.9 Android) target this surface.
4+
5+
-- Imported source documents, one row per document.
-- imported_at is stored as an INTEGER timestamp (unit not specified here; confirm
-- against the code that writes it).
CREATE TABLE IF NOT EXISTS knowledge_documents (
    id           TEXT    NOT NULL PRIMARY KEY,
    title        TEXT    NOT NULL,
    source_uri   TEXT,
    imported_at  INTEGER NOT NULL,
    content_hash TEXT    NOT NULL
);

-- Supports listing documents newest-first.
CREATE INDEX IF NOT EXISTS idx_knowledge_documents_imported_at
    ON knowledge_documents (imported_at DESC);

-- Supports lookup by content hash. The index is not UNIQUE, so the schema itself
-- does not prevent two documents from sharing a hash.
CREATE INDEX IF NOT EXISTS idx_knowledge_documents_content_hash
    ON knowledge_documents (content_hash);
18+
19+
-- Chunks of a document's text, ordered within the document by chunk_index.
-- Deleting a document cascades to its chunks (requires foreign keys to be enabled
-- on the connection).
--
-- No separate index on document_id is declared: the UNIQUE (document_id, chunk_index)
-- constraint already creates an index whose leftmost column is document_id, which
-- serves `WHERE document_id = ?` lookups, the `ORDER BY chunk_index` scan, and the
-- foreign-key cascade. A second single-column index would only add write overhead.
CREATE TABLE IF NOT EXISTS knowledge_chunks (
    id          TEXT    NOT NULL PRIMARY KEY,
    document_id TEXT    NOT NULL,
    chunk_index INTEGER NOT NULL,
    text        TEXT    NOT NULL,
    char_start  INTEGER NOT NULL,
    char_end    INTEGER NOT NULL,
    FOREIGN KEY (document_id) REFERENCES knowledge_documents(id) ON DELETE CASCADE,
    UNIQUE (document_id, chunk_index)
);
32+
33+
-- Embedding vectors, at most one per (chunk, model) pair.
-- vector_blob holds the raw bytes of the vector. Deleting a chunk cascades to its
-- embeddings (requires foreign keys to be enabled on the connection).
CREATE TABLE IF NOT EXISTS knowledge_embeddings (
    chunk_id    TEXT    NOT NULL,
    model_id    TEXT    NOT NULL,
    vector_blob BLOB    NOT NULL,
    created_at  INTEGER NOT NULL,
    PRIMARY KEY (chunk_id, model_id),
    FOREIGN KEY (chunk_id) REFERENCES knowledge_chunks(id) ON DELETE CASCADE
);

-- model_id is the second column of the primary key, so per-model scans and deletes
-- need their own index.
CREATE INDEX IF NOT EXISTS idx_knowledge_embeddings_model_id
    ON knowledge_embeddings (model_id);
44+
45+
-- =============================================================================
-- DOCUMENT QUERIES
-- =============================================================================

-- Insert a new document row; fails if a row with the same id already exists.
insertDocument:
INSERT INTO knowledge_documents (
    id,
    title,
    source_uri,
    imported_at,
    content_hash
)
VALUES (?, ?, ?, ?, ?);
52+
53+
-- Insert-or-update keyed by id, without deleting the existing row.
--
-- The previous `INSERT OR REPLACE` resolved a primary-key conflict by deleting the
-- old row before inserting the new one. With foreign keys enabled, that delete
-- cascades (ON DELETE CASCADE) into knowledge_chunks and knowledge_embeddings, so
-- refreshing a document's metadata silently discarded its chunks and vectors.
--
-- Here a missing row is inserted, then the row is updated in place, so child rows
-- are left untouched. The INSERT comes first so the bind parameters are intended to
-- keep their original order: id, title, source_uri, imported_at, content_hash.
upsertDocument {
    INSERT OR IGNORE INTO knowledge_documents(id, title, source_uri, imported_at, content_hash)
    VALUES (:id, :title, :source_uri, :imported_at, :content_hash);

    UPDATE knowledge_documents
    SET title = :title,
        source_uri = :source_uri,
        imported_at = :imported_at,
        content_hash = :content_hash
    WHERE id = :id;
}
56+
57+
-- Fetch one document by primary key.
getDocumentById:
SELECT *
FROM knowledge_documents
WHERE id = ?;

-- Fetch at most one document with the given content hash.
getDocumentByContentHash:
SELECT *
FROM knowledge_documents
WHERE content_hash = ?
LIMIT 1;

-- List documents newest-first, capped at :limit rows.
listDocuments:
SELECT *
FROM knowledge_documents
ORDER BY imported_at DESC
LIMIT :limit;

-- Delete one document by primary key.
deleteDocument:
DELETE FROM knowledge_documents
WHERE id = ?;

-- Total number of stored documents.
countDocuments:
SELECT COUNT(*)
FROM knowledge_documents;
73+
74+
-- =============================================================================
-- CHUNK QUERIES
-- =============================================================================

-- Insert a new chunk row.
insertChunk:
INSERT INTO knowledge_chunks (
    id,
    document_id,
    chunk_index,
    text,
    char_start,
    char_end
)
VALUES (?, ?, ?, ?, ?, ?);

-- Replace the text and offsets of an existing chunk, identified by id.
updateChunkText:
UPDATE knowledge_chunks
SET text = ?,
    char_start = ?,
    char_end = ?
WHERE id = ?;

-- Fetch one chunk by primary key.
getChunkById:
SELECT *
FROM knowledge_chunks
WHERE id = ?;

-- All chunks of a document, in chunk order.
getChunksByDocument:
SELECT *
FROM knowledge_chunks
WHERE document_id = ?
ORDER BY chunk_index ASC;

-- Remove every chunk belonging to a document.
deleteChunksForDocument:
DELETE FROM knowledge_chunks
WHERE document_id = ?;

-- Total number of stored chunks.
countChunks:
SELECT COUNT(*)
FROM knowledge_chunks;
100+
101+
-- =============================================================================
-- EMBEDDING QUERIES
-- =============================================================================

-- Insert a new embedding; fails if one already exists for (chunk_id, model_id).
insertEmbedding:
INSERT INTO knowledge_embeddings (
    chunk_id,
    model_id,
    vector_blob,
    created_at
)
VALUES (?, ?, ?, ?);

-- Insert an embedding, replacing any existing row for the same (chunk_id, model_id).
upsertEmbedding:
INSERT OR REPLACE INTO knowledge_embeddings (
    chunk_id,
    model_id,
    vector_blob,
    created_at
)
VALUES (?, ?, ?, ?);

-- Fetch the embedding of one chunk for one model.
getEmbedding:
SELECT *
FROM knowledge_embeddings
WHERE chunk_id = ?
  AND model_id = ?;

-- All embeddings produced by one model.
getEmbeddingsForModel:
SELECT *
FROM knowledge_embeddings
WHERE model_id = ?;

-- Joined view: every chunk that has an embedding for the given model, together
-- with that embedding. Platform similarity scoring iterates this set in memory.
getChunkEmbeddingsForModel:
SELECT
    chunk.id              AS chunk_id,
    chunk.document_id     AS document_id,
    chunk.chunk_index     AS chunk_index,
    chunk.text            AS text,
    chunk.char_start      AS char_start,
    chunk.char_end        AS char_end,
    embedding.model_id    AS model_id,
    embedding.vector_blob AS vector_blob,
    embedding.created_at  AS created_at
FROM knowledge_chunks chunk
INNER JOIN knowledge_embeddings embedding
    ON chunk.id = embedding.chunk_id
WHERE embedding.model_id = ?;

-- Remove every embedding of one chunk, across all models.
deleteEmbeddingsForChunk:
DELETE FROM knowledge_embeddings
WHERE chunk_id = ?;

-- Remove every embedding produced by one model.
deleteEmbeddingsForModel:
DELETE FROM knowledge_embeddings
WHERE model_id = ?;

-- Total number of stored embeddings.
countEmbeddings:
SELECT COUNT(*)
FROM knowledge_embeddings;

0 commit comments

Comments
 (0)