
Commit a10a0b5 (parent: 7adaa80)

chore: update llama-cpp readme (#15)

* feat: impl llama cpp tokenization/embedding
* chore: readme

File tree: 8 files changed, +229 −15 lines

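For quick orientation, here are the two llama.cpp-backend calls this commit documents, `tokenize` and `getEmbedding`, condensed into one sketch. The model path, `LoadConfig` fields, and sampling parameters are copied from the README additions below; only the sequential chaining of the two calls is added here.

```typescript
import { LLama } from "llama-node";
import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
import path from "path";

// Same quantized model file used by the README examples in this commit.
const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin");

const llama = new LLama(LLamaCpp);

const config: LoadConfig = {
    path: model,
    enableLogging: true,
    nCtx: 1024,
    nParts: -1,
    seed: 0,
    f16Kv: false,
    logitsAll: false,
    vocabOnly: false,
    useMlock: false,
    embedding: false,
};

llama.load(config);

// New in this commit: tokenization, then an embedding, on the llama.cpp backend.
llama
    .tokenize({ content: "how are you?", nCtx: 2048 })
    .then(console.log)
    .then(() =>
        llama.getEmbedding({
            nThreads: 4,
            nTokPredict: 2048,
            topK: 40,
            topP: 0.1,
            temp: 0.2,
            repeatPenalty: 1,
            prompt: "Who is the president of the United States?",
        })
    )
    .then(console.log);
```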

README-zh-CN.md

Lines changed: 82 additions & 6 deletions

@@ -23,10 +23,13 @@ Node.js运行的大语言模型LLaMA。
 - [模型获取](#模型获取)
 - [模型版本](#模型版本)
 - [使用(llama.cpp后端)](#使用llamacpp后端)
-- [使用(llama-rs后端)](#使用llama-rs后端)
 - [推理](#推理)
 - [分词](#分词)
 - [嵌入](#嵌入)
+- [使用(llama-rs后端)](#使用llama-rs后端)
+- [推理](#推理-1)
+- [分词](#分词-1)
+- [嵌入](#嵌入-1)
 - [关于性能](#关于性能)
 - [手动编译 (from node\_modules)](#手动编译-from-node_modules)
 - [手动编译 (from source)](#手动编译-from-source)

@@ -96,11 +99,11 @@ llama-rs后端现在只支持GGML / GGMF模型。llama.cpp后端仅支持GGJT模
 
 如果您希望同时进行多个推理会话,则需要创建多个LLama实例。
 
-llama.cpp后端现仅支持推理. 嵌入和分词功能请等待后期更新。
+### 推理
 
 ```typescript
 import { LLama } from "llama-node";
-import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp";
+import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
 import path from "path";
 
 const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin");

@@ -148,6 +151,79 @@ llama.createCompletion(
 
 ```
 
+### 分词
+
+```typescript
+import { LLama } from "llama-node";
+import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
+import path from "path";
+
+const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin");
+
+const llama = new LLama(LLamaCpp);
+
+const config: LoadConfig = {
+    path: model,
+    enableLogging: true,
+    nCtx: 1024,
+    nParts: -1,
+    seed: 0,
+    f16Kv: false,
+    logitsAll: false,
+    vocabOnly: false,
+    useMlock: false,
+    embedding: false,
+};
+
+llama.load(config);
+
+const content = "how are you?";
+
+llama.tokenize({ content, nCtx: 2048 }).then(console.log);
+
+```
+
+### 嵌入
+```typescript
+import { LLama } from "llama-node";
+import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
+import path from "path";
+
+const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin");
+
+const llama = new LLama(LLamaCpp);
+
+const config: LoadConfig = {
+    path: model,
+    enableLogging: true,
+    nCtx: 1024,
+    nParts: -1,
+    seed: 0,
+    f16Kv: false,
+    logitsAll: false,
+    vocabOnly: false,
+    useMlock: false,
+    embedding: false,
+};
+
+llama.load(config);
+
+const prompt = `Who is the president of the United States?`;
+
+const params = {
+    nThreads: 4,
+    nTokPredict: 2048,
+    topK: 40,
+    topP: 0.1,
+    temp: 0.2,
+    repeatPenalty: 1,
+    prompt,
+};
+
+llama.getEmbedding(params).then(console.log);
+
+```
+
 ---
 
 ## 使用(llama-rs后端)

@@ -160,7 +236,7 @@ llama.createCompletion(
 
 ```typescript
 import { LLama } from "llama-node";
-import { LLamaRS } from "llama-node/dist/llm/llama-rs";
+import { LLamaRS } from "llama-node/dist/llm/llama-rs.js";
 import path from "path";
 
 const model = path.resolve(process.cwd(), "./ggml-alpaca-7b-q4.bin");

@@ -203,7 +279,7 @@ llama.createCompletion(
 
 ```typescript
 import { LLama } from "llama-node";
-import { LLamaRS } from "llama-node/dist/llm/llama-rs";
+import { LLamaRS } from "llama-node/dist/llm/llama-rs.js";
 import path from "path";
 
 const model = path.resolve(process.cwd(), "./ggml-alpaca-7b-q4.bin");

@@ -223,7 +299,7 @@ llama.tokenize(content).then(console.log);
 
 ```typescript
 import { LLama } from "llama-node";
-import { LLamaRS } from "llama-node/dist/llm/llama-rs";
+import { LLamaRS } from "llama-node/dist/llm/llama-rs.js";
 import path from "path";
 import fs from "fs";
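
The hunk above keeps the note that one LLama instance serves one inference session at a time ("如果您希望同时进行多个推理会话,则需要创建多个LLama实例"), but neither README shows what that looks like. Below is a minimal sketch under that rule: two instances, two independent requests. The config and call parameters are taken verbatim from this commit's examples; the `Promise.all` arrangement is the only added idea, and note that each instance loads its own copy of the weights, so memory use grows with the number of instances.

```typescript
import { LLama } from "llama-node";
import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
import path from "path";

const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin");

const config: LoadConfig = {
    path: model,
    enableLogging: true,
    nCtx: 1024,
    nParts: -1,
    seed: 0,
    f16Kv: false,
    logitsAll: false,
    vocabOnly: false,
    useMlock: false,
    embedding: false,
};

// One LLama instance handles one session at a time, so create one instance
// per concurrent request rather than sharing a single instance.
const llamaA = new LLama(LLamaCpp);
const llamaB = new LLama(LLamaCpp);
llamaA.load(config);
llamaB.load(config);

// Two independent requests running side by side, one per instance.
Promise.all([
    llamaA.tokenize({ content: "how are you?", nCtx: 2048 }),
    llamaB.getEmbedding({
        nThreads: 4,
        nTokPredict: 2048,
        topK: 40,
        topP: 0.1,
        temp: 0.2,
        repeatPenalty: 1,
        prompt: "Who is the president of the United States?",
    }),
]).then(([tokens, embedding]) => {
    console.log(tokens, embedding);
});
```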

README.md

Lines changed: 82 additions & 6 deletions

@@ -25,10 +25,13 @@ This project is in an early stage, the API for nodejs may change in the future,
 - [Getting the weights](#getting-the-weights)
 - [Model versioning](#model-versioning)
 - [Usage (llama.cpp backend)](#usage-llamacpp-backend)
-- [Usage (llama-rs backend)](#usage-llama-rs-backend)
 - [Inference](#inference)
 - [Tokenize](#tokenize)
 - [Embedding](#embedding)
+- [Usage (llama-rs backend)](#usage-llama-rs-backend)
+- [Inference](#inference-1)
+- [Tokenize](#tokenize-1)
+- [Embedding](#embedding-1)
 - [Performance related](#performance-related)
 - [Manual compilation (from node\_modules)](#manual-compilation-from-node_modules)
 - [Manual compilation (from source)](#manual-compilation-from-source)

@@ -98,11 +101,11 @@ The current version supports only one inference session on one LLama instance at
 
 If you wish to have multiple inference sessions concurrently, you need to create multiple LLama instances
 
-llama.cpp backend now only supports inferencing. Please wait for embedding and tokenization feature.
+### Inference
 
 ```typescript
 import { LLama } from "llama-node";
-import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp";
+import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
 import path from "path";
 
 const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin");

@@ -150,6 +153,79 @@ llama.createCompletion(
 
 ```
 
+### Tokenize
+
+```typescript
+import { LLama } from "llama-node";
+import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
+import path from "path";
+
+const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin");
+
+const llama = new LLama(LLamaCpp);
+
+const config: LoadConfig = {
+    path: model,
+    enableLogging: true,
+    nCtx: 1024,
+    nParts: -1,
+    seed: 0,
+    f16Kv: false,
+    logitsAll: false,
+    vocabOnly: false,
+    useMlock: false,
+    embedding: false,
+};
+
+llama.load(config);
+
+const content = "how are you?";
+
+llama.tokenize({ content, nCtx: 2048 }).then(console.log);
+
+```
+
+### Embedding
+```typescript
+import { LLama } from "llama-node";
+import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
+import path from "path";
+
+const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin");
+
+const llama = new LLama(LLamaCpp);
+
+const config: LoadConfig = {
+    path: model,
+    enableLogging: true,
+    nCtx: 1024,
+    nParts: -1,
+    seed: 0,
+    f16Kv: false,
+    logitsAll: false,
+    vocabOnly: false,
+    useMlock: false,
+    embedding: false,
+};
+
+llama.load(config);
+
+const prompt = `Who is the president of the United States?`;
+
+const params = {
+    nThreads: 4,
+    nTokPredict: 2048,
+    topK: 40,
+    topP: 0.1,
+    temp: 0.2,
+    repeatPenalty: 1,
+    prompt,
+};
+
+llama.getEmbedding(params).then(console.log);
+
+```
+
 ---
 
 ## Usage (llama-rs backend)

@@ -162,7 +238,7 @@ If you wish to have multiple inference sessions concurrently, you need to create
 
 ```typescript
 import { LLama } from "llama-node";
-import { LLamaRS } from "llama-node/dist/llm/llama-rs";
+import { LLamaRS } from "llama-node/dist/llm/llama-rs.js";
 import path from "path";
 
 const model = path.resolve(process.cwd(), "./ggml-alpaca-7b-q4.bin");

@@ -205,7 +281,7 @@ Get tokenization result from LLaMA
 
 ```typescript
 import { LLama } from "llama-node";
-import { LLamaRS } from "llama-node/dist/llm/llama-rs";
+import { LLamaRS } from "llama-node/dist/llm/llama-rs.js";
 import path from "path";
 
 const model = path.resolve(process.cwd(), "./ggml-alpaca-7b-q4.bin");

@@ -226,7 +302,7 @@ Preview version, embedding end token may change in the future. Do not use it in
 
 ```typescript
 import { LLama } from "llama-node";
-import { LLamaRS } from "llama-node/dist/llm/llama-rs";
+import { LLamaRS } from "llama-node/dist/llm/llama-rs.js";
 import path from "path";
 import fs from "fs";
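
One detail worth calling out from the diff above: the two backends now expose `tokenize` with different argument shapes. The llama.cpp backend added in this commit takes an object (`{ content, nCtx }`), while the llama-rs backend keeps taking the string itself. A side-by-side sketch; the llama-rs load options are not part of this diff, so that call is only stubbed out here (see the llama-rs section of the README for the real config).

```typescript
import { LLama } from "llama-node";
import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
import { LLamaRS } from "llama-node/dist/llm/llama-rs.js";
import path from "path";

const content = "how are you?";

// llama.cpp backend: tokenize takes an object with the text and a context size.
const cpp = new LLama(LLamaCpp);
const cppConfig: LoadConfig = {
    path: path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin"),
    enableLogging: true,
    nCtx: 1024,
    nParts: -1,
    seed: 0,
    f16Kv: false,
    logitsAll: false,
    vocabOnly: false,
    useMlock: false,
    embedding: false,
};
cpp.load(cppConfig);
cpp.tokenize({ content, nCtx: 2048 }).then(console.log);

// llama-rs backend: tokenize takes the raw string. Its load config is not
// shown in this diff, so only a placeholder is given here.
const rs = new LLama(LLamaRS);
rs.load({ path: path.resolve(process.cwd(), "./ggml-alpaca-7b-q4.bin") } as any);
rs.tokenize(content).then(console.log);
```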

example/llama-cpp/embedding.ts

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+import { LLama } from "../../src";
+import { LLamaCpp, LoadConfig } from "../../src/llm/llama-cpp";
+import path from "path";
+
+const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin");
+
+const llama = new LLama(LLamaCpp);
+
+const config: LoadConfig = {
+    path: model,
+    enableLogging: true,
+    nCtx: 1024,
+    nParts: -1,
+    seed: 0,
+    f16Kv: false,
+    logitsAll: false,
+    vocabOnly: false,
+    useMlock: false,
+    embedding: false,
+};
+
+llama.load(config);
+
+const prompt = `Who is the president of the United States?`;
+
+const params = {
+    nThreads: 4,
+    nTokPredict: 2048,
+    topK: 40,
+    topP: 0.1,
+    temp: 0.2,
+    repeatPenalty: 1,
+    prompt,
+};
+
+llama.getEmbedding(params).then(console.log);
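
The new example only prints the embedding, so here is a hedged sketch of one common follow-up: comparing two prompts by cosine similarity. It assumes `getEmbedding` resolves to a flat `number[]` (the example never inspects the value), reuses the example's config and sampling parameters unchanged, and the `cosine` helper plus the second prompt are purely illustrative.

```typescript
import { LLama } from "llama-node";
import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
import path from "path";

const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin");
const llama = new LLama(LLamaCpp);

const config: LoadConfig = {
    path: model,
    enableLogging: true,
    nCtx: 1024,
    nParts: -1,
    seed: 0,
    f16Kv: false,
    logitsAll: false,
    vocabOnly: false,
    useMlock: false,
    embedding: false,
};
llama.load(config);

// Sampling parameters copied from the example above; only `prompt` varies.
const baseParams = {
    nThreads: 4,
    nTokPredict: 2048,
    topK: 40,
    topP: 0.1,
    temp: 0.2,
    repeatPenalty: 1,
};

// Plain cosine similarity between two equal-length vectors.
const cosine = (a: number[], b: number[]): number => {
    let dot = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
    }
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
};

const main = async () => {
    // Assumption: getEmbedding resolves to number[].
    const e1 = (await llama.getEmbedding({ ...baseParams, prompt: "Who is the president of the United States?" })) as number[];
    const e2 = (await llama.getEmbedding({ ...baseParams, prompt: "Who currently leads the US government?" })) as number[];
    console.log("cosine similarity:", cosine(e1, e2));
};

main();
```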

example/llama-cpp/tokenize.ts

Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
+import { LLama } from "../../src";
+import { LLamaCpp, LoadConfig } from "../../src/llm/llama-cpp";
+import path from "path";
+
+const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin");
+
+const llama = new LLama(LLamaCpp);
+
+const config: LoadConfig = {
+    path: model,
+    enableLogging: true,
+    nCtx: 1024,
+    nParts: -1,
+    seed: 0,
+    f16Kv: false,
+    logitsAll: false,
+    vocabOnly: false,
+    useMlock: false,
+    embedding: false,
+};
+
+llama.load(config);
+
+const content = "how are you?";
+
+llama.tokenize({ content, nCtx: 2048 }).then(console.log);
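
As a follow-up to the tokenize example, a small sketch of what the token list is commonly used for: checking that a prompt fits the configured context window before running a completion. It assumes `tokenize` resolves to an array of token ids (the example above only logs it); the `fitsInContext` helper and the `reserve` head-room are illustrative names, not part of the library.

```typescript
import { LLama } from "llama-node";
import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
import path from "path";

const model = path.resolve(process.cwd(), "./ggml-vicuna-7b-4bit-rev1.bin");
const llama = new LLama(LLamaCpp);

const config: LoadConfig = {
    path: model,
    enableLogging: true,
    nCtx: 1024,
    nParts: -1,
    seed: 0,
    f16Kv: false,
    logitsAll: false,
    vocabOnly: false,
    useMlock: false,
    embedding: false,
};
llama.load(config);

// Check whether a prompt fits into the configured context window, keeping
// `reserve` tokens of head-room for the generated continuation.
const fitsInContext = async (prompt: string, reserve = 256): Promise<boolean> => {
    // Assumption: tokenize resolves to an array of token ids.
    const tokens = (await llama.tokenize({ content: prompt, nCtx: config.nCtx })) as number[];
    console.log(`prompt uses ${tokens.length} of ${config.nCtx} context tokens`);
    return tokens.length + reserve <= config.nCtx;
};

fitsInContext("how are you?").then((ok) => console.log("fits:", ok));
```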

src/index.ts

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 import { CompletionCallback } from "./llm";
-import { LLM } from "./llm";
+import type { LLM } from "./llm";
 
 export class LLama<
     Instance,
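
The only change here is switching `LLM` to a type-only import. A generic TypeScript sketch of why that matters (all names below are hypothetical, not llama-node APIs): `import type` is guaranteed to be erased from the emitted JavaScript, so a module referenced only in type positions never becomes a runtime dependency, and single-file transpilers or `isolatedModules` builds don't have to guess whether the import is a value.

```typescript
// types.ts — an interface that is only ever referenced in type positions.
export interface LLMLike {
    load(config: { path: string }): void;
}

// consumer.ts — with `import type`, the compiled JavaScript contains no
// require/import of "./types" at all; a plain `import` would leave that up
// to how the compiler (or a single-file transpiler) resolves the usage.
import type { LLMLike } from "./types";

export class Wrapper<T extends LLMLike> {
    constructor(private readonly backend: T) {}

    load(modelPath: string): void {
        this.backend.load({ path: modelPath });
    }
}
```
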
File renamed without changes.

src/llm/llama-cpp.ts

Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@ import {
     TokenizeResultType,
 } from "@llama-node/llama-cpp";
 
-import { LLM } from "../llm";
+import type { LLM } from "../llm";
 
 export interface LoadConfig extends LlamaContextParams {
     path: string;
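
`LoadConfig` above is the native `LlamaContextParams` plus wrapper-level fields — at least `path`, which is all the diff context shows. Since the same config literal is repeated across the new examples, here is a small hedged sketch that centralizes it in a helper; `makeDefaultConfig` is a made-up name, and the field values are simply the ones used throughout this commit.

```typescript
import { LLama } from "llama-node";
import { LLamaCpp, LoadConfig } from "llama-node/dist/llm/llama-cpp.js";
import path from "path";

// Hypothetical helper: the exact config literal used by the examples in this
// commit, parameterized only by the model path.
const makeDefaultConfig = (modelPath: string): LoadConfig => ({
    path: path.resolve(process.cwd(), modelPath),
    enableLogging: true,
    nCtx: 1024,
    nParts: -1,
    seed: 0,
    f16Kv: false,
    logitsAll: false,
    vocabOnly: false,
    useMlock: false,
    embedding: false,
});

const llama = new LLama(LLamaCpp);
llama.load(makeDefaultConfig("./ggml-vicuna-7b-4bit-rev1.bin"));
```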

src/llm/llama-rs.ts

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 import { LLama, LLamaConfig, LLamaInferenceArguments } from "@llama-node/core";
-import { LLM } from "../llm";
+import type { LLM } from "../llm";
 
 export class LLamaRS
     implements
