fix: clear llama memory cache on subsequent embeddings to prevent context overflow (#73)

uchebnick · web-flow · commit eb01ad6d39a8 · 2026-04-09T23:53:08.000+03:00
* fix(llama): prevent context overflow and filter invalid utf8/null bytes

* fix(embed): clear llama KV cache memory between embeddings

* fix(lint): handle or ignore error returns in embedder

---------

Co-authored-by: uchebnick <uchebnick@users.noreply.github.com>
diff --git a/internal/embed/llama/embedder.go b/internal/embed/llama/embedder.go
@@ -116,6 +116,8 @@ func New(cfg Config) (*Embedder, error) {
 
 	ctxParams := llama.ContextDefaultParams()
 	ctxParams.NCtx = uint32(cfg.ContextSize)
+	ctxParams.NBatch = uint32(cfg.ContextSize)
+	ctxParams.NUbatch = uint32(cfg.ContextSize)
 	ctxParams.PoolingType = cfg.Pooling
 	ctxParams.Embeddings = 1
 
@@ -185,28 +187,35 @@ func (e *Embedder) Dim() int {
 }
 
 func (e *Embedder) Embed(text string) ([]float32, error) {
-	if e == nil {
-		return nil, fmt.Errorf("nil embedder")
-	}
+	e.mu.Lock()
+	defer e.mu.Unlock()
 
 	text = normalizeText(text)
-	if text == "" {
-		return nil, fmt.Errorf("empty text")
+
+	tokens := llama.Tokenize(e.vocab, text, true, false)
+	if len(tokens) == 0 {
+		return nil, nil // Return empty if text results in zero tokens
 	}
 
-	e.mu.Lock()
-	defer e.mu.Unlock()
+	// Truncate tokens if they exceed ContextSize
+	if len(tokens) > e.contextSize {
+		tokens = tokens[:e.contextSize]
+	}
 
-	tokens := llama.Tokenize(e.vocab, text, true, true)
-	if len(tokens) == 0 {
-		return nil, fmt.Errorf("tokenize returned zero tokens")
+	// Clear memory before processing new tokens
+	mem, err := llama.GetMemory(e.ctx)
+	if err == nil {
+		_ = llama.MemoryClear(mem, true)
 	}
 
 	if len(tokens) > e.contextSize {
 		tokens = tokens[:e.contextSize]
 	}
 
 	batch := llama.BatchGetOne(tokens)
+	defer func() {
+		_ = llama.BatchFree(batch)
+	}()
 
 	ret, err := llama.Decode(e.ctx, batch)
 	if err != nil {