Commit cc86646

Llama cpp version update (#401)
* -1 is not a valid seed for llama_cpp; use its default of 0
* Update llama_cpp API to ~> 0.9.4
* Note breaking change in CHANGELOG.md
1 parent 5936259 commit cc86646

6 files changed: 41 additions & 21 deletions

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -1,5 +1,8 @@
 ## [Unreleased]
 
+## [0.8.0]
+- [BREAKING] Updated llama_cpp.rb to 0.9.4. The model file format used by the underlying llama.cpp library has changed to GGUF. llama.cpp ships with scripts to convert existing files, and GGUF-format models can be downloaded from HuggingFace.
+
 ## [0.7.5] - 2023-11-13
 - Fixes
 
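Since the GGUF switch is the user-visible break, a minimal usage sketch of the updated wrapper follows. The model path is a hypothetical placeholder, and the keyword defaults mirror the new signature in this commit; older ggml files can be converted with llama.cpp's bundled scripts, or a ready-made .gguf file can be downloaded from HuggingFace.

    require "langchain"

    # Hypothetical GGUF model path; substitute your own converted or
    # downloaded .gguf file.
    llm = Langchain::LLM::LlamaCpp.new(
      model_path: "/models/llama-2-7b.Q4_K_M.gguf",
      n_gpu_layers: 1,
      n_ctx: 2048,
      n_threads: 4,
      seed: 0 # llama_cpp's default; -1 is no longer a valid seed
    )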

Gemfile.lock

Lines changed: 2 additions & 2 deletions
@@ -187,7 +187,7 @@ GEM
     jwt (2.7.1)
     language_server-protocol (3.17.0.3)
     lint_roller (1.0.0)
-    llama_cpp (0.3.7)
+    llama_cpp (0.9.4)
     loofah (2.21.1)
       crass (~> 1.0.2)
       nokogiri (>= 1.5.9)
@@ -395,7 +395,7 @@ DEPENDENCIES
   hnswlib (~> 0.8.1)
   hugging-face (~> 0.3.4)
   langchainrb!
-  llama_cpp (~> 0.3.7)
+  llama_cpp (~> 0.9.4)
   milvus (~> 0.9.2)
   nokogiri (~> 1.13)
   open-weather-ruby-client (~> 0.4.0)

langchain.gemspec

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "hnswlib", "~> 0.8.1"
   spec.add_development_dependency "hugging-face", "~> 0.3.4"
   spec.add_development_dependency "milvus", "~> 0.9.2"
-  spec.add_development_dependency "llama_cpp", "~> 0.3.7"
+  spec.add_development_dependency "llama_cpp", "~> 0.9.4"
   spec.add_development_dependency "nokogiri", "~> 1.13"
   spec.add_development_dependency "open-weather-ruby-client", "~> 0.4.0"
   spec.add_development_dependency "pg", "~> 1.5"

lib/langchain/llm/llama_cpp.rb

Lines changed: 18 additions & 16 deletions
@@ -22,7 +22,7 @@ class LlamaCpp < Base
     # @param n_ctx [Integer] The number of context tokens to use
     # @param n_threads [Integer] The CPU number of threads to use
     # @param seed [Integer] The seed to use
-    def initialize(model_path:, n_gpu_layers: 1, n_ctx: 2048, n_threads: 1, seed: -1)
+    def initialize(model_path:, n_gpu_layers: 1, n_ctx: 2048, n_threads: 1, seed: 0)
      depends_on "llama_cpp"
 
      @model_path = model_path
@@ -33,30 +33,25 @@ def initialize(model_path:, n_gpu_layers: 1, n_ctx: 2048, n_threads: 1, seed: -1)
     end
 
     # @param text [String] The text to embed
-    # @param n_threads [Integer] The number of CPU threads to use
     # @return [Array<Float>] The embedding
-    def embed(text:, n_threads: nil)
+    def embed(text:)
       # contexts are kinda stateful when it comes to embeddings, so allocate one each time
       context = embedding_context
 
-      embedding_input = context.tokenize(text: text, add_bos: true)
+      embedding_input = @model.tokenize(text: text, add_bos: true)
       return unless embedding_input.size.positive?
 
-      n_threads ||= self.n_threads
-
-      context.eval(tokens: embedding_input, n_past: 0, n_threads: n_threads)
-      context.embeddings
+      context.eval(tokens: embedding_input, n_past: 0)
+      Langchain::LLM::LlamaCppResponse.new(context, model: context.model.desc)
     end
 
     # @param prompt [String] The prompt to complete
     # @param n_predict [Integer] The number of tokens to predict
-    # @param n_threads [Integer] The number of CPU threads to use
     # @return [String] The completed prompt
-    def complete(prompt:, n_predict: 128, n_threads: nil)
-      n_threads ||= self.n_threads
+    def complete(prompt:, n_predict: 128)
       # contexts do not appear to be stateful when it comes to completion, so re-use the same one
       context = completion_context
-      ::LLaMACpp.generate(context, prompt, n_threads: n_threads, n_predict: n_predict)
+      ::LLaMACpp.generate(context, prompt, n_predict: n_predict)
     end
 
     private
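For callers, the two public methods no longer take n_threads (the thread count is now set once at construction time and applied through the context params below), and embed returns a response object rather than a bare array. A sketch of the new call shape, reusing the hypothetical llm from the earlier example:

    # Completion: the thread count comes from the constructor, not the call.
    completion = llm.complete(prompt: "Hello", n_predict: 128)

    # Embedding: unwrap the vector via the new response object.
    vector = llm.embed(text: "Hello World").embedding # => Array of Floats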
@@ -71,23 +66,30 @@ def build_context_params(embeddings: false)
 
       context_params.seed = seed
       context_params.n_ctx = n_ctx
-      context_params.n_gpu_layers = n_gpu_layers
+      context_params.n_threads = n_threads
       context_params.embedding = embeddings
 
       context_params
     end
 
+    def build_model_params
+      model_params = ::LLaMACpp::ModelParams.new
+      model_params.n_gpu_layers = n_gpu_layers
+
+      model_params
+    end
+
     def build_model(embeddings: false)
       return @model if defined?(@model)
-      @model = ::LLaMACpp::Model.new(model_path: model_path, params: build_context_params(embeddings: embeddings))
+      @model = ::LLaMACpp::Model.new(model_path: model_path, params: build_model_params)
     end
 
     def build_completion_context
-      ::LLaMACpp::Context.new(model: build_model)
+      ::LLaMACpp::Context.new(model: build_model, params: build_context_params(embeddings: false))
     end
 
     def build_embedding_context
-      ::LLaMACpp::Context.new(model: build_model(embeddings: true))
+      ::LLaMACpp::Context.new(model: build_model, params: build_context_params(embeddings: true))
     end
 
     def completion_context
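The private-method reshuffle tracks llama_cpp 0.9.x's split of options into two objects: n_gpu_layers is a model-level setting on ModelParams, while seed, n_ctx, n_threads, and the embedding flag stay per-context on ContextParams. A sketch of the two-object setup, assembled from the calls that appear in this diff (the ContextParams constructor is inferred from the llama_cpp gem's API rather than shown in the hunk, and the model path is hypothetical):

    model_params = ::LLaMACpp::ModelParams.new
    model_params.n_gpu_layers = 1 # model-level: layers to offload to the GPU

    context_params = ::LLaMACpp::ContextParams.new
    context_params.seed = 0
    context_params.n_ctx = 2048
    context_params.n_threads = 4
    context_params.embedding = true # allocate an embedding-capable context

    model = ::LLaMACpp::Model.new(model_path: "/models/llama-2-7b.Q4_K_M.gguf", params: model_params)
    context = ::LLaMACpp::Context.new(model: model, params: context_params)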
lib/langchain/llm/response/llama_cpp_response.rb

Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
+# frozen_string_literal: true
+
+module Langchain::LLM
+  class LlamaCppResponse < BaseResponse
+    def embedding
+      embeddings
+    end
+
+    def embeddings
+      raw_response.embeddings
+    end
+  end
+end
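The response object keeps the context as raw_response, so both accessors resolve to the same data; a short usage sketch:

    response = llm.embed(text: "Hello World") # => Langchain::LLM::LlamaCppResponse
    response.embeddings # the wrapped context's embedding array
    response.embedding  # same data via the singular accessor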

spec/langchain/llm/llama_cpp_spec.rb

Lines changed: 4 additions & 2 deletions
@@ -18,13 +18,15 @@
     let(:embedding) { [0.1, 0.2, 0.3] }
 
     before do
-      allow(llama_context).to receive(:tokenize).and_return([1, 9029])
+      allow(llama_model).to receive(:tokenize).and_return([1, 9029])
+      allow(llama_model).to receive(:desc).and_return("test")
+      allow(llama_context).to receive(:model).and_return(llama_model)
       allow(llama_context).to receive(:eval)
       allow(llama_context).to receive(:embeddings).and_return(embedding)
     end
 
     it "generates an embedding" do
-      expect(subject.embed(text: "Hello World")).to eq(embedding)
+      expect(subject.embed(text: "Hello World").embedding).to eq(embedding)
     end
   end
 