socket-link
diff --git a/‎ampere-core/src/commonMain/kotlin/link/socket/ampere/knowledge/EmbeddingVector.kt‎
Lines changed: 120 additions & 0 deletions b/‎ampere-core/src/commonMain/kotlin/link/socket/ampere/knowledge/EmbeddingVector.kt‎
Lines changed: 120 additions & 0 deletions
diff --git a/‎ampere-core/src/commonMain/kotlin/link/socket/ampere/knowledge/KnowledgeStore.kt‎
Lines changed: 113 additions & 0 deletions b/‎ampere-core/src/commonMain/kotlin/link/socket/ampere/knowledge/KnowledgeStore.kt‎
Lines changed: 113 additions & 0 deletions
diff --git a/‎ampere-core/src/commonMain/sqldelight/link/socket/ampere/db/Knowledge.sq‎
Lines changed: 145 additions & 0 deletions b/‎ampere-core/src/commonMain/sqldelight/link/socket/ampere/db/Knowledge.sq‎
Lines changed: 145 additions & 0 deletions
@@ -0,0 +1,120 @@
+package link.socket.ampere.knowledge
+
+import kotlin.math.sqrt
+
+/**
+ * A dense vector representation of a document chunk produced by an embedding model.
+ *
+ * Provides similarity primitives ([cosineSimilarity], [dotProduct]) used by the
+ * on-device knowledge retrieval pipeline, plus a stable byte serialization
+ * ([toBlob] / [fromBlob]) for SQLDelight BLOB persistence.
+ *
+ * Values use structural equality based on the underlying [FloatArray] contents.
+ *
+ * The [values] array is stored by reference, not copied: mutating it after
+ * construction also changes the results of [equals] and [hashCode], so treat it as
+ * read-only (in particular while the vector is a key in a hash-based collection).
+ *
+ * @property values Vector components; must contain at least one element.
+ * @throws IllegalArgumentException if [values] is empty.
+ */
+class EmbeddingVector(
+    val values: FloatArray,
+) {
+    init {
+        require(values.isNotEmpty()) { "EmbeddingVector must have at least one dimension" }
+    }
+
+    /** Number of dimensions in the vector. */
+    val dimension: Int get() = values.size
+
+    /**
+     * Sum of element-wise products with [other].
+     *
+     * Products are accumulated in [Double] and narrowed to [Float] once at the end, so
+     * rounding error does not build up across the dimensions of the vector.
+     *
+     * @throws IllegalArgumentException if [dimension] does not match.
+     */
+    fun dotProduct(other: EmbeddingVector): Float {
+        requireSameDimension(other)
+        val a = values
+        val b = other.values
+        var sum = 0.0
+        for (i in a.indices) {
+            sum += a[i].toDouble() * b[i].toDouble()
+        }
+        return sum.toFloat()
+    }
+
+    /**
+     * Cosine similarity in `[-1, 1]`. Returns `0` when either vector has zero magnitude.
+     *
+     * The dot product and both squared norms are accumulated in [Double]: squaring a
+     * large [Float] component can overflow to infinity (which would turn the result into
+     * `NaN`) and squaring a very small one can underflow to zero (which would make a
+     * non-zero vector look like it has zero magnitude). The final ratio is clamped so
+     * rounding can never push it outside the documented range.
+     *
+     * Non-finite components (`NaN` or infinity) produce `NaN`.
+     *
+     * @throws IllegalArgumentException if [dimension] does not match.
+     */
+    fun cosineSimilarity(other: EmbeddingVector): Float {
+        requireSameDimension(other)
+        val a = values
+        val b = other.values
+        var dot = 0.0
+        var squaredNormA = 0.0
+        var squaredNormB = 0.0
+        for (i in a.indices) {
+            val x = a[i].toDouble()
+            val y = b[i].toDouble()
+            dot += x * y
+            squaredNormA += x * x
+            squaredNormB += y * y
+        }
+        val denominator = sqrt(squaredNormA) * sqrt(squaredNormB)
+        if (denominator == 0.0) return 0f
+        // coerceIn leaves NaN untouched, so invalid input is still visible to the caller.
+        return (dot / denominator).toFloat().coerceIn(-1f, 1f)
+    }
+
+    /**
+     * Serialize to a fixed-size big-endian byte array (`dimension * 4` bytes) for BLOB storage.
+     *
+     * Format: each [Float] is encoded via [Float.toRawBits] in big-endian order so the
+     * payload is portable across all Kotlin Multiplatform targets.
+     */
+    fun toBlob(): ByteArray {
+        val bytes = ByteArray(values.size * BYTES_PER_FLOAT)
+        for (i in values.indices) {
+            val bits = values[i].toRawBits()
+            val offset = i * BYTES_PER_FLOAT
+            // Most significant byte first; toByte() keeps only the low 8 bits.
+            bytes[offset] = (bits ushr 24).toByte()
+            bytes[offset + 1] = (bits ushr 16).toByte()
+            bytes[offset + 2] = (bits ushr 8).toByte()
+            bytes[offset + 3] = bits.toByte()
+        }
+        return bytes
+    }
+
+    override fun equals(other: Any?): Boolean {
+        if (this === other) return true
+        if (other !is EmbeddingVector) return false
+        return values.contentEquals(other.values)
+    }
+
+    override fun hashCode(): Int = values.contentHashCode()
+
+    /** Reports only the dimension; the component values are intentionally omitted. */
+    override fun toString(): String = "EmbeddingVector(dimension=$dimension)"
+
+    /** Shared precondition of the binary operations: both operands need the same length. */
+    private fun requireSameDimension(other: EmbeddingVector) {
+        require(dimension == other.dimension) {
+            "Dimension mismatch: $dimension vs ${other.dimension}"
+        }
+    }
+
+    companion object {
+        private const val BYTES_PER_FLOAT = 4
+
+        /**
+         * Decode a vector previously written by [toBlob].
+         *
+         * @throws IllegalArgumentException if [bytes] length is not a positive multiple of 4.
+         */
+        fun fromBlob(bytes: ByteArray): EmbeddingVector {
+            require(bytes.isNotEmpty() && bytes.size % BYTES_PER_FLOAT == 0) {
+                "Embedding blob must be a positive multiple of $BYTES_PER_FLOAT bytes, got ${bytes.size}"
+            }
+            val floats = FloatArray(bytes.size / BYTES_PER_FLOAT)
+            for (i in floats.indices) {
+                val offset = i * BYTES_PER_FLOAT
+                // Mask each byte to 0..255 before shifting so sign extension of negative
+                // bytes cannot leak into the higher bits. The parentheses spell out the
+                // grouping that Kotlin's left-to-right infix evaluation would apply anyway.
+                val bits = ((bytes[offset].toInt() and 0xff) shl 24) or
+                    ((bytes[offset + 1].toInt() and 0xff) shl 16) or
+                    ((bytes[offset + 2].toInt() and 0xff) shl 8) or
+                    (bytes[offset + 3].toInt() and 0xff)
+                floats[i] = Float.fromBits(bits)
+            }
+            return EmbeddingVector(floats)
+        }
+    }
+}
@@ -0,0 +1,113 @@
+package link.socket.ampere.knowledge
+
+import kotlinx.datetime.Instant
+
+/**
+ * On-device knowledge primitive: stores documents, chunks them, embeds the chunks,
+ * and answers retrieval queries over the local index.
+ *
+ * Implementations are platform-specific (W1.8 iOS, W1.9 Android) — this commonMain
+ * surface is the contract those pipelines target.
+ *
+ * All work runs on-device. No cloud embedding APIs are invoked from this layer.
+ *
+ * Every operation is a `suspend` function returning [Result].
+ * NOTE(review): presumably implementations report failures through the returned
+ * [Result] rather than by throwing — this interface does not state it; confirm.
+ */
+interface KnowledgeStore {
+
+    /**
+     * Persist a new document or update an existing one keyed by content hash.
+     *
+     * Implementations should be idempotent: importing a document whose
+     * [KnowledgeDocument.contentHash] already exists must not create duplicate
+     * chunks or embeddings.
+     *
+     * @param document Document to persist.
+     * @return NOTE(review): presumably the document as stored (the already-existing one
+     *         when the hash was seen before) — not specified here; confirm.
+     */
+    suspend fun addDocument(document: KnowledgeDocument): Result<KnowledgeDocument>
+
+    /**
+     * Split [documentId]'s content into chunks and produce embeddings for each chunk
+     * using the on-device embedding model identified by [modelId].
+     *
+     * @param documentId Document previously persisted via [addDocument].
+     * @param modelId Identifier of the embedding model (e.g. `"all-MiniLM-L6-v2"`).
+     * @return The chunks that were embedded, in order.
+     */
+    suspend fun chunkAndEmbed(
+        documentId: String,
+        modelId: String,
+    ): Result<List<KnowledgeChunk>>
+
+    /**
+     * Retrieve chunks relevant to [text] using the supplied [mode].
+     *
+     * @param text Free-form query.
+     * @param limit Maximum number of results to return. Defaults to [DEFAULT_QUERY_LIMIT].
+     *        NOTE(review): behavior for zero or negative values is not specified — confirm.
+     * @param mode Retrieval strategy. Defaults to [QueryMode.HYBRID].
+     * @return At most [limit] hits. NOTE(review): presumably ordered best match first
+     *         (higher [KnowledgeQueryResult.score] first) — not stated here; confirm.
+     */
+    suspend fun query(
+        text: String,
+        limit: Int = DEFAULT_QUERY_LIMIT,
+        mode: QueryMode = QueryMode.HYBRID,
+    ): Result<List<KnowledgeQueryResult>>
+
+    companion object {
+        /** Default value of the `limit` parameter of [query]. */
+        const val DEFAULT_QUERY_LIMIT: Int = 10
+    }
+}
+
+/**
+ * Strategy for resolving a [KnowledgeStore.query] call.
+ *
+ * [HYBRID] is the default mode of [KnowledgeStore.query].
+ */
+enum class QueryMode {
+    /** Cosine-similarity search over [EmbeddingVector]s. */
+    SEMANTIC,
+
+    /** FTS5 keyword search over chunk text. */
+    KEYWORD,
+
+    /**
+     * Combine [SEMANTIC] and [KEYWORD] results, blending scores.
+     *
+     * NOTE(review): the blending formula is not defined by this contract — it is
+     * presumably left to each platform implementation; confirm.
+     */
+    HYBRID,
+}
+
+/**
+ * A document imported into the on-device knowledge store.
+ *
+ * NOTE(review): the `knowledge_documents` table in Knowledge.sq has a column for every
+ * property except [content], so the text is not persisted by that schema — confirm
+ * where implementations keep or re-load it. The same table stores `imported_at` as an
+ * INTEGER; the unit used to encode [importedAt] (presumably epoch milliseconds) is not
+ * fixed here — confirm.
+ *
+ * @param id Stable identifier.
+ * @param title Human-readable title.
+ * @param sourceUri Optional pointer to the original source (file path, URL, etc.).
+ * @param importedAt When the document was imported.
+ * @param contentHash Hash of the source content; used for idempotent re-import.
+ * @param content Raw textual content. May be empty if the implementation lazily
+ *        loads content from [sourceUri]. Defaults to the empty string.
+ */
+data class KnowledgeDocument(
+    val id: String,
+    val title: String,
+    val sourceUri: String?,
+    val importedAt: Instant,
+    val contentHash: String,
+    val content: String = "",
+)
+
+/**
+ * A chunk of a [KnowledgeDocument]'s text — the unit at which embeddings are produced
+ * and retrieval results are returned.
+ *
+ * @param id Identifier of the chunk (primary key of `knowledge_chunks`).
+ * @param documentId [KnowledgeDocument.id] of the document this chunk belongs to.
+ * @param chunkIndex Position of the chunk within its document; the schema enforces
+ *        uniqueness of `(document_id, chunk_index)`.
+ * @param text Text of the chunk.
+ * @param charStart Start of the chunk within the document text.
+ * @param charEnd End of the chunk within the document text.
+ *        NOTE(review): whether [charEnd] is inclusive or exclusive, and whether offsets
+ *        count UTF-16 code units, is not specified — confirm with the chunker.
+ */
+data class KnowledgeChunk(
+    val id: String,
+    val documentId: String,
+    val chunkIndex: Int,
+    val text: String,
+    val charStart: Int,
+    val charEnd: Int,
+)
+
+/**
+ * A single retrieval hit returned by [KnowledgeStore.query].
+ *
+ * NOTE(review): SQLite FTS5 ranks better matches with numerically lower values, so a
+ * keyword implementation presumably has to invert or normalize the rank to honor the
+ * "higher is better" rule below — confirm in the platform implementations.
+ *
+ * @param chunk The matching chunk.
+ * @param score Mode-dependent score (cosine similarity, FTS rank, or hybrid blend).
+ *        Higher scores indicate better matches.
+ */
+data class KnowledgeQueryResult(
+    val chunk: KnowledgeChunk,
+    val score: Float,
+)
@@ -0,0 +1,145 @@
+-- On-device knowledge schema: documents, chunks, and embedding vectors.
+-- Vectors are stored as raw BLOBs serialized via EmbeddingVector.toBlob().
+-- Platform retrieval pipelines (W1.8 iOS, W1.9 Android) target this surface.
+
+-- One row per imported document. Holds metadata only: no column stores the document text.
+-- source_uri is the only nullable column.
+-- NOTE(review): imported_at is an INTEGER timestamp; the unit (presumably epoch
+-- milliseconds) is decided by the Kotlin mapping layer — confirm.
+CREATE TABLE IF NOT EXISTS knowledge_documents (
+    id TEXT PRIMARY KEY NOT NULL,
+    title TEXT NOT NULL,
+    source_uri TEXT,
+    imported_at INTEGER NOT NULL,
+    content_hash TEXT NOT NULL
+);
+
+-- Supports listDocuments, which orders by imported_at DESC.
+CREATE INDEX IF NOT EXISTS idx_knowledge_documents_imported_at
+    ON knowledge_documents(imported_at DESC);
+
+-- Supports getDocumentByContentHash. The index is not UNIQUE, so the schema itself
+-- allows several documents with the same hash.
+-- NOTE(review): idempotent re-import therefore has to be enforced by the caller;
+-- consider making this index UNIQUE if duplicates are never intended — confirm.
+CREATE INDEX IF NOT EXISTS idx_knowledge_documents_content_hash
+    ON knowledge_documents(content_hash);
+
+-- One row per chunk of a document. (document_id, chunk_index) is unique, so each
+-- position within a document can be occupied by at most one chunk.
+-- Deleting a document removes its chunks through ON DELETE CASCADE; SQLite applies
+-- foreign-key actions only when enforcement is switched on (PRAGMA foreign_keys = ON).
+-- NOTE(review): confirm that every platform driver enables foreign keys.
+CREATE TABLE IF NOT EXISTS knowledge_chunks (
+    id TEXT PRIMARY KEY NOT NULL,
+    document_id TEXT NOT NULL,
+    chunk_index INTEGER NOT NULL,
+    text TEXT NOT NULL,
+    char_start INTEGER NOT NULL,
+    char_end INTEGER NOT NULL,
+    FOREIGN KEY (document_id) REFERENCES knowledge_documents(id) ON DELETE CASCADE,
+    UNIQUE (document_id, chunk_index)
+);
+
+-- Lookup of chunks by document.
+-- NOTE(review): the UNIQUE (document_id, chunk_index) constraint above already creates
+-- an index whose leading column is document_id, so this index is likely redundant and
+-- only adds write cost — confirm before removing.
+CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_document_id
+    ON knowledge_chunks(document_id);
+
+-- One embedding per (chunk, model) pair, so the same chunk can be embedded by several
+-- models side by side. vector_blob holds the bytes produced by EmbeddingVector.toBlob().
+-- Deleting a chunk removes its embeddings through ON DELETE CASCADE (requires
+-- foreign-key enforcement to be enabled on the connection).
+CREATE TABLE IF NOT EXISTS knowledge_embeddings (
+    chunk_id TEXT NOT NULL,
+    model_id TEXT NOT NULL,
+    vector_blob BLOB NOT NULL,
+    created_at INTEGER NOT NULL,
+    PRIMARY KEY (chunk_id, model_id),
+    FOREIGN KEY (chunk_id) REFERENCES knowledge_chunks(id) ON DELETE CASCADE
+);
+
+-- Supports the per-model scans and deletes below; the primary key starts with
+-- chunk_id and therefore cannot serve lookups by model_id alone.
+CREATE INDEX IF NOT EXISTS idx_knowledge_embeddings_model_id
+    ON knowledge_embeddings(model_id);
+
+-- =============================================================================
+-- DOCUMENT QUERIES
+-- =============================================================================
+
+-- Plain insert of a new document row. Fails with a constraint error when a row with
+-- the same id already exists.
+insertDocument:
+INSERT INTO knowledge_documents(id, title, source_uri, imported_at, content_hash)
+VALUES (?, ?, ?, ?, ?);
+
+-- Insert a document, or overwrite the non-key columns of the row with the same id.
+--
+-- Deliberately NOT `INSERT OR REPLACE`: REPLACE resolves a primary-key conflict by
+-- deleting the existing row before inserting the new one. With foreign-key
+-- enforcement on, that delete fires ON DELETE CASCADE on knowledge_chunks (and from
+-- there on knowledge_embeddings), wiping the document's whole index on every update.
+--
+-- The grouped statements run in one transaction and never delete the existing row.
+-- INSERT comes first so the bind parameters keep the order
+-- (id, title, source_uri, imported_at, content_hash).
+upsertDocument {
+  INSERT OR IGNORE INTO knowledge_documents(id, title, source_uri, imported_at, content_hash)
+  VALUES (:id, :title, :source_uri, :imported_at, :content_hash);
+
+  UPDATE knowledge_documents
+  SET title = :title,
+      source_uri = :source_uri,
+      imported_at = :imported_at,
+      content_hash = :content_hash
+  WHERE id = :id;
+}
+
+-- Fetch a single document by primary key.
+getDocumentById:
+SELECT * FROM knowledge_documents WHERE id = ?;
+
+-- Fetch one document with the given content hash. content_hash is not unique in the
+-- schema, and without ORDER BY the row picked by LIMIT 1 is unspecified when several
+-- documents share the hash.
+getDocumentByContentHash:
+SELECT * FROM knowledge_documents WHERE content_hash = ? LIMIT 1;
+
+-- Most recently imported documents first, capped at :limit rows.
+listDocuments:
+SELECT * FROM knowledge_documents
+ORDER BY imported_at DESC
+LIMIT :limit;
+
+-- Delete a document by id. Its chunks and their embeddings are removed through the
+-- ON DELETE CASCADE foreign keys when foreign-key enforcement is enabled.
+deleteDocument:
+DELETE FROM knowledge_documents WHERE id = ?;
+
+-- Total number of stored documents.
+countDocuments:
+SELECT COUNT(*) FROM knowledge_documents;
+
+-- =============================================================================
+-- CHUNK QUERIES
+-- =============================================================================
+
+-- Insert a new chunk. Fails on a duplicate id or a duplicate (document_id, chunk_index).
+insertChunk:
+INSERT INTO knowledge_chunks(id, document_id, chunk_index, text, char_start, char_end)
+VALUES (?, ?, ?, ?, ?, ?);
+
+-- Replace the text and character range of an existing chunk.
+-- Bind order: text, char_start, char_end, then the id of the chunk to update.
+-- NOTE(review): embeddings computed from the old text are left untouched by this
+-- statement — presumably the caller re-embeds afterwards; confirm.
+updateChunkText:
+UPDATE knowledge_chunks
+SET text = ?, char_start = ?, char_end = ?
+WHERE id = ?;
+
+-- Fetch a single chunk by primary key.
+getChunkById:
+SELECT * FROM knowledge_chunks WHERE id = ?;
+
+-- All chunks of one document in reading order (ascending chunk_index).
+getChunksByDocument:
+SELECT * FROM knowledge_chunks
+WHERE document_id = ?
+ORDER BY chunk_index ASC;
+
+-- Delete every chunk of a document; their embeddings follow through ON DELETE CASCADE
+-- when foreign-key enforcement is enabled.
+deleteChunksForDocument:
+DELETE FROM knowledge_chunks WHERE document_id = ?;
+
+-- Total number of stored chunks across all documents.
+countChunks:
+SELECT COUNT(*) FROM knowledge_chunks;
+
+-- =============================================================================
+-- EMBEDDING QUERIES
+-- =============================================================================
+
+-- Insert a new embedding. Fails when one already exists for the same (chunk_id, model_id).
+insertEmbedding:
+INSERT INTO knowledge_embeddings(chunk_id, model_id, vector_blob, created_at)
+VALUES (?, ?, ?, ?);
+
+-- Insert an embedding, or replace the existing one for the same (chunk_id, model_id).
+-- REPLACE deletes the conflicting row before inserting; no table in this file
+-- references knowledge_embeddings, so that delete has no cascading effect.
+upsertEmbedding:
+INSERT OR REPLACE INTO knowledge_embeddings(chunk_id, model_id, vector_blob, created_at)
+VALUES (?, ?, ?, ?);
+
+-- Fetch the embedding of one chunk for one model (at most one row, by primary key).
+getEmbedding:
+SELECT * FROM knowledge_embeddings
+WHERE chunk_id = ? AND model_id = ?;
+
+-- All embeddings produced by one model, in unspecified order.
+getEmbeddingsForModel:
+SELECT * FROM knowledge_embeddings
+WHERE model_id = ?;
+
+-- Joined view: every chunk with its embedding for a specific model.
+-- Platform similarity scoring iterates this set in memory.
+-- Chunks without an embedding for the model are omitted (INNER JOIN), and there is no
+-- ORDER BY, so the row order is unspecified. The result contains one row per embedded
+-- chunk, so its size grows with the number of chunks embedded by that model.
+getChunkEmbeddingsForModel:
+SELECT
+    c.id AS chunk_id,
+    c.document_id AS document_id,
+    c.chunk_index AS chunk_index,
+    c.text AS text,
+    c.char_start AS char_start,
+    c.char_end AS char_end,
+    e.model_id AS model_id,
+    e.vector_blob AS vector_blob,
+    e.created_at AS created_at
+FROM knowledge_chunks c
+INNER JOIN knowledge_embeddings e ON c.id = e.chunk_id
+WHERE e.model_id = ?;
+
+-- Delete the embeddings of one chunk for every model.
+deleteEmbeddingsForChunk:
+DELETE FROM knowledge_embeddings WHERE chunk_id = ?;
+
+-- Delete every embedding produced by one model (e.g. when a model is retired).
+deleteEmbeddingsForModel:
+DELETE FROM knowledge_embeddings WHERE model_id = ?;
+
+-- Total number of stored embeddings across all models.
+countEmbeddings:
+SELECT COUNT(*) FROM knowledge_embeddings;