This repository was archived by the owner on Sep 12, 2024. It is now read-only.

Commit e82222d

feature: implemented parallel inference for llama-rs, implemented naive sequential async inference for llama-cpp and rwkv-cpp (#52)
* feat: support parallel inference for llama-rs, support sequential async for llama-cpp and rwkv-cpp
1 parent a311873 · commit e82222d

Showing 32 changed files with 558 additions and 1,415 deletions.
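
Across the examples below, the user-facing change is that LLama.create now resolves asynchronously, and one-shot calls such as tokenize and getWordEmbeddings return promises instead of taking callbacks, while inference keeps its streaming callback. A minimal sketch of the migrated call pattern, assuming the relative import path and the ggml-alpaca model path used by the repo's own examples:

// Sketch only: mirrors the updated examples in packages/core/example.
// The import path and model path are the ones the examples use and may
// need adjusting for your setup.
import { LLama, InferenceResultType } from "../index";
import path from "path";

const run = async () => {
  // create() is now awaited before the instance is used.
  const llama = await LLama.create({
    path: path.resolve(process.cwd(), "../../ggml-alpaca-7b-q4.bin"),
    numCtxTokens: 128,
  });

  // tokenize() resolves with the tokens instead of invoking a callback.
  const tokens = await llama.tokenize("My favourite animal is the cat");
  console.log(tokens);

  // inference() still streams results through a callback.
  llama.inference(
    {
      prompt: "how are you",
      numPredict: 128,
      temp: 0.2,
      topP: 1,
      topK: 40,
      repeatPenalty: 1,
      repeatLastN: 64,
      seed: 0,
      feedPrompt: true,
    },
    (response) => {
      if (response.type === InferenceResultType.Data) {
        process.stdout.write(response.data?.token ?? "");
      } else {
        console.log(response);
      }
    }
  );
};

run();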

packages/cli/src/index.ts

Lines changed: 2 additions & 2 deletions
@@ -12,7 +12,7 @@ import { existsSync } from "fs";
 
 const convertType = ["q4_0", "q4_1", "f16", "f32"] as const;
 
-type ConvertType = typeof convertType[number];
+type ConvertType = (typeof convertType)[number];
 
 interface CLIInferenceArguments extends LLamaInferenceArguments, LLamaConfig {
   logger?: boolean;
@@ -75,7 +75,7 @@ class InferenceCommand implements yargs.CommandModule {
     if (logger) {
      LLama.enableLogger();
    }
-    const llama = LLama.create({ path: absolutePath, numCtxTokens });
+    const llama = await LLama.create({ path: absolutePath, numCtxTokens });
    llama.inference(rest, (result) => {
      switch (result.type) {
        case InferenceResultType.Data:

packages/core/__test__/index.spec.ts

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ test(
   async () => {
     LLama.enableLogger();
 
-    const llama = LLama.create({
+    const llama = await LLama.create({
      path: process.env.model?.toString()!,
      numCtxTokens: 128,
    });

packages/core/example/cachesession.ts

Lines changed: 36 additions & 32 deletions
@@ -6,46 +6,50 @@ const saveSession = path.resolve(process.cwd(), "./tmp/session.bin");
 
 LLama.enableLogger();
 
-const llama = LLama.create({
-  path: model,
-  numCtxTokens: 128,
-});
+const run = async () => {
+  const llama = await LLama.create({
+    path: model,
+    numCtxTokens: 128,
+  });
 
-const template = `how are you`;
+  const template = `how are you`;
 
-const prompt = `Below is an instruction that describes a task. Write a response that appropriately completes the request.
+  const prompt = `Below is an instruction that describes a task. Write a response that appropriately completes the request.
 
 ### Instruction:
 
 ${template}
 
 ### Response:`;
 
-llama.inference(
-  {
-    prompt,
-    numPredict: 128,
-    temp: 0.2,
-    topP: 1,
-    topK: 40,
-    repeatPenalty: 1,
-    repeatLastN: 64,
-    seed: 0,
-    feedPrompt: true,
-    feedPromptOnly: true,
-    saveSession,
-  },
-  (response) => {
-    switch (response.type) {
-      case InferenceResultType.Data: {
-        process.stdout.write(response.data?.token ?? "");
-        break;
-      }
-      case InferenceResultType.End:
-      case InferenceResultType.Error: {
-        console.log(response);
-        break;
+  llama.inference(
+    {
+      prompt,
+      numPredict: 128,
+      temp: 0.2,
+      topP: 1,
+      topK: 40,
+      repeatPenalty: 1,
+      repeatLastN: 64,
+      seed: 0,
+      feedPrompt: true,
+      feedPromptOnly: true,
+      saveSession,
+    },
+    (response) => {
+      switch (response.type) {
+        case InferenceResultType.Data: {
+          process.stdout.write(response.data?.token ?? "");
+          break;
+        }
+        case InferenceResultType.End:
+        case InferenceResultType.Error: {
+          console.log(response);
+          break;
+        }
       }
     }
-  }
-);
+  );
+};
+
+run();

packages/core/example/embedding.ts

Lines changed: 35 additions & 39 deletions
@@ -1,51 +1,47 @@
-import { EmbeddingResultType, LLama } from "../index";
+import { LLama } from "../index";
 import path from "path";
 import fs from "fs";
 
 const model = path.resolve(process.cwd(), "../../ggml-alpaca-7b-q4.bin");
 
 LLama.enableLogger();
 
-const llama = LLama.create({
-  path: model,
-  numCtxTokens: 128,
-});
-
-const getWordEmbeddings = (prompt: string, file: string) => {
-  llama.getWordEmbeddings(
-    {
-      prompt,
-      numPredict: 128,
-      temp: 0.2,
-      topP: 1,
-      topK: 40,
-      repeatPenalty: 1,
-      repeatLastN: 64,
-      seed: 0,
-    },
-    (response) => {
-      switch (response.type) {
-        case EmbeddingResultType.Data: {
-          fs.writeFileSync(
-            path.resolve(process.cwd(), file),
-            JSON.stringify(response.data)
-          );
-          break;
-        }
-        case EmbeddingResultType.Error: {
-          console.log(response);
-          break;
-        }
-      }
-    }
+const getWordEmbeddings = async (
+  llama: LLama,
+  prompt: string,
+  file: string
+) => {
+  const response = await llama.getWordEmbeddings({
+    prompt,
+    numPredict: 128,
+    temp: 0.2,
+    topP: 1,
+    topK: 40,
+    repeatPenalty: 1,
+    repeatLastN: 64,
+    seed: 0,
+  });
+
+  fs.writeFileSync(
+    path.resolve(process.cwd(), file),
+    JSON.stringify(response)
   );
 };
 
-const dog1 = `My favourite animal is the dog`;
-getWordEmbeddings(dog1, "./example/semantic-compare/dog1.json");
+const run = async () => {
+  const llama = await LLama.create({
+    path: model,
+    numCtxTokens: 128,
+  });
+
+  const dog1 = `My favourite animal is the dog`;
+  getWordEmbeddings(llama, dog1, "./example/semantic-compare/dog1.json");
 
-const dog2 = `I have just adopted a cute dog`;
-getWordEmbeddings(dog2, "./example/semantic-compare/dog2.json");
+  const dog2 = `I have just adopted a cute dog`;
+  getWordEmbeddings(llama, dog2, "./example/semantic-compare/dog2.json");
+
+  const cat1 = `My favourite animal is the cat`;
+  getWordEmbeddings(llama, cat1, "./example/semantic-compare/cat1.json");
+};
 
-const cat1 = `My favourite animal is the cat`;
-getWordEmbeddings(cat1, "./example/semantic-compare/cat1.json");
+run();
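
Because getWordEmbeddings now resolves with the embedding result, several prompts could in principle be requested concurrently and awaited together. A hedged sketch reusing the prompts from the example above; the Promise.all fan-out assumes the llama-rs backend tolerates concurrent requests (as the commit title's "parallel inference" suggests) and is not something the example itself does:

// Hypothetical fan-out over the promise-based embeddings API.
// Assumes concurrent calls are safe on the llama-rs backend.
const embedAll = async (llama: LLama) => {
  const prompts = [
    `My favourite animal is the dog`,
    `I have just adopted a cute dog`,
    `My favourite animal is the cat`,
  ];

  // Issue all embedding requests at once and wait for every result.
  const embeddings = await Promise.all(
    prompts.map((prompt) =>
      llama.getWordEmbeddings({
        prompt,
        numPredict: 128,
        temp: 0.2,
        topP: 1,
        topK: 40,
        repeatPenalty: 1,
        repeatLastN: 64,
        seed: 0,
      })
    )
  );

  return embeddings;
};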

packages/core/example/inference.ts

Lines changed: 33 additions & 31 deletions
@@ -6,45 +6,47 @@ const model = path.resolve(process.cwd(), "../../ggml-alpaca-7b-q4.bin");
 
 LLama.enableLogger();
 
-const llama = LLama.create({
-  path: model,
-  numCtxTokens: 128,
-});
+const run = async () => {
+  const llama = await LLama.create({
+    path: model,
+    numCtxTokens: 128,
+  });
 
-const template = `how are you`;
+  const template = `how are you`;
 
-const prompt = `Below is an instruction that describes a task. Write a response that appropriately completes the request.
+  const prompt = `Below is an instruction that describes a task. Write a response that appropriately completes the request.
 
 ### Instruction:
 
 ${template}
 
 ### Response:`;
 
-llama.inference(
-  {
-    prompt,
-    numPredict: 128,
-    temp: 0.2,
-    topP: 1,
-    topK: 40,
-    repeatPenalty: 1,
-    repeatLastN: 64,
-    seed: 0,
-    feedPrompt: true,
-    // persistSession,
-  },
-  (response) => {
-    switch (response.type) {
-      case InferenceResultType.Data: {
-        process.stdout.write(response.data?.token ?? "");
-        break;
-      }
-      case InferenceResultType.End:
-      case InferenceResultType.Error: {
-        console.log(response);
-        break;
+  llama.inference(
+    {
+      prompt,
+      numPredict: 128,
+      temp: 0.2,
+      topP: 1,
+      topK: 40,
+      repeatPenalty: 1,
+      repeatLastN: 64,
+      seed: 0,
+      feedPrompt: true,
+    },
+    (response) => {
+      switch (response.type) {
+        case InferenceResultType.Data: {
+          process.stdout.write(response.data?.token ?? "");
+          break;
+        }
+        case InferenceResultType.End:
+        case InferenceResultType.Error: {
+          console.log(response);
+          break;
+        }
      }
    }
-  }
-);
+  );
+};
+run();

packages/core/example/loadsession.ts

Lines changed: 32 additions & 28 deletions
@@ -6,34 +6,38 @@ const loadSession = path.resolve(process.cwd(), "./tmp/session.bin");
 
 LLama.enableLogger();
 
-const llama = LLama.create({
-  path: model,
-  numCtxTokens: 128,
-});
+const run = async () => {
+  const llama = await LLama.create({
+    path: model,
+    numCtxTokens: 128,
+  });
 
-llama.inference(
-  {
-    prompt: "",
-    numPredict: 128,
-    temp: 0.2,
-    topP: 1,
-    topK: 40,
-    repeatPenalty: 1,
-    repeatLastN: 64,
-    seed: 0,
-    loadSession,
-  },
-  (response) => {
-    switch (response.type) {
-      case InferenceResultType.Data: {
-        process.stdout.write(response.data?.token ?? "");
-        break;
-      }
-      case InferenceResultType.End:
-      case InferenceResultType.Error: {
-        console.log(response);
-        break;
+  llama.inference(
+    {
+      prompt: "",
+      numPredict: 128,
+      temp: 0.2,
+      topP: 1,
+      topK: 40,
+      repeatPenalty: 1,
+      repeatLastN: 64,
+      seed: 0,
+      loadSession,
+    },
+    (response) => {
+      switch (response.type) {
+        case InferenceResultType.Data: {
+          process.stdout.write(response.data?.token ?? "");
+          break;
+        }
+        case InferenceResultType.End:
+        case InferenceResultType.Error: {
+          console.log(response);
+          break;
+        }
      }
    }
-  }
-);
+  );
+};
+
+run();

packages/core/example/tokenize.ts

Lines changed: 12 additions & 9 deletions
@@ -5,14 +5,17 @@ const model = path.resolve(process.cwd(), "../../ggml-alpaca-7b-q4.bin");
 
 LLama.enableLogger();
 
-const llama = LLama.create({
-  path: model,
-  numCtxTokens: 128,
-});
+const run = async () => {
+  const llama = await LLama.create({
+    path: model,
+    numCtxTokens: 128,
+  });
 
-const prompt = "My favourite animal is the cat";
+  const prompt = "My favourite animal is the cat";
 
-llama.tokenize(prompt, (response) => {
-  console.log(response);
-  console.log(response.data.length); // 7
-});
+  const tokens = await llama.tokenize(prompt);
+
+  console.log(tokens);
+};
+
+run();
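
With tokenize returning a promise, a caller can await the token count inline, for example to check a prompt against the context window before running inference. A small sketch; it assumes tokenize resolves with the token array (as the old callback's response.data.length check suggests), and the 128 limit simply mirrors numCtxTokens in the example:

// Illustrative guard built on the promise-based tokenize().
// The limit is an assumption taken from the example's numCtxTokens.
const fitsContext = async (llama: LLama, prompt: string, limit = 128) => {
  const tokens = await llama.tokenize(prompt);
  return tokens.length <= limit;
};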
