// Copyright (C) 2023-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <filesystem>

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/stl_bind.h>
#include <pybind11/stl/filesystem.h>
#include <pybind11/functional.h>

#include "openvino/genai/llm_pipeline.hpp"

#include "tokenizers_path.hpp"
#include "py_utils.hpp"

namespace py = pybind11;
namespace pyutils = ov::genai::pybind::utils;

using ov::genai::OptionalGenerationConfig;
using ov::genai::LLMPipeline;
using ov::genai::TokenizedInputs;
using ov::genai::EncodedInputs;
using ov::genai::StreamerVariant;
using ov::genai::DecodedResults;
using ov::genai::Tokenizer;
using ov::genai::draft_model;

namespace {

auto generate_docstring = R"(
Generates sequences or tokens for LLMs. If the input is a string or a list of strings, the resulting sequences are returned already detokenized.

:param inputs: inputs in the form of a string, a list of strings, or tokenized input_ids
:type inputs: str, List[str], ov.genai.TokenizedInputs, or ov.Tensor

:param generation_config: generation_config
:type generation_config: GenerationConfig or a dict

:param streamer: streamer, either a lambda that accepts a string and returns a boolean flag indicating whether generation should be stopped, or a StreamerBase instance
:type streamer: Callable[[str], bool], ov.genai.StreamerBase

:param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields
:type kwargs: dict

:return: results in encoded or decoded form, depending on the inputs type
:rtype: DecodedResults, EncodedResults, str
)";
py::object call_common_generate(
LLMPipeline& pipe,
const std::variant<ov::Tensor, TokenizedInputs, std::string, std::vector<std::string>>& inputs,
const OptionalGenerationConfig& config,
const pyutils::PyBindStreamerVariant& py_streamer,
const py::kwargs& kwargs
) {
ov::genai::GenerationConfig default_config = config.has_value() ? *config : pipe.get_generation_config();
auto updated_config = pyutils::update_config_from_kwargs(default_config, kwargs);
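// Keyword arguments (e.g. max_new_tokens=64) override the matching fields of the
// base config, so callers can tweak generation without building a GenerationConfig.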
py::object results;
StreamerVariant streamer = pyutils::pystreamer_to_streamer(py_streamer);
// Call suitable generate overload for each type of input.
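// Each lambda below releases the GIL for the duration of the C++ generate() call so
// other Python threads can run; pybind11 re-acquires the GIL automatically whenever
// the streamer callback re-enters Python.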
std::visit(pyutils::overloaded {
[&](ov::Tensor ov_tensor) {
ov::genai::EncodedResults encoded_results;
{
py::gil_scoped_release rel;
encoded_results = pipe.generate(ov_tensor, updated_config, streamer);
}
results = py::cast(encoded_results);
},
[&](TokenizedInputs tokenized_input) {
ov::genai::EncodedResults encoded_results;
{
py::gil_scoped_release rel;
encoded_results = pipe.generate(tokenized_input, updated_config, streamer);
}
results = py::cast(encoded_results);
},
[&](std::string string_input) {
DecodedResults res;
{
py::gil_scoped_release rel;
res = pipe.generate(string_input, updated_config, streamer);
}
// If the input was a single string and one sequence was requested, return a plain string; otherwise return DecodedResults.
if (updated_config.has_value() && (*updated_config).num_return_sequences == 1) {
results = py::cast<py::object>(pyutils::handle_utf8(res.texts[0]));
} else {
results = py::cast(res);
}
},
[&](std::vector<std::string> string_input) {
// For DecodedResults, the texts getter already handles UTF-8 decoding.
DecodedResults res;
{
py::gil_scoped_release rel;
res = pipe.generate(string_input, updated_config, streamer);
}
results = py::cast(res);
}},
inputs);
return results;
}
}  // namespace

extern char generation_config_docstring[];

void init_llm_pipeline(py::module_& m) {
py::class_<LLMPipeline>(m, "LLMPipeline", "This class is used for generation with LLMs")
// init(models_path, tokenizer, device, config, kwargs) should be defined before init(models_path, device, config, kwargs)
// to prevent the tokenizer from being treated as a kwargs argument
.def(py::init([](
const std::filesystem::path& models_path,
const Tokenizer& tokenizer,
const std::string& device,
const std::map<std::string, py::object>& config,
const py::kwargs& kwargs
) {
ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
ov::AnyMap properties = pyutils::kwargs_to_any_map(kwargs);
if (config.size()) {
PyErr_WarnEx(PyExc_DeprecationWarning,
"'config' parameters is deprecated, please use kwargs to pass config properties instead.",
1);
auto config_properties = pyutils::properties_to_any_map(config);
properties.insert(config_properties.begin(), config_properties.end());
}
return std::make_unique<LLMPipeline>(models_path, tokenizer, device, properties);
}),
py::arg("models_path"),
py::arg("tokenizer"),
py::arg("device"),
py::arg("config") = ov::AnyMap({}), "openvino.properties map",
R"(
LLMPipeline class constructor for manually created openvino_genai.Tokenizer.
models_path (os.PathLike): Path to the model file.
tokenizer (openvino_genai.Tokenizer): tokenizer object.
device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'.
Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline.
kwargs: Device properties.
)")
.def(py::init([](
const std::filesystem::path& models_path,
const std::string& device,
const std::map<std::string, py::object>& config,
const py::kwargs& kwargs
) {
ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
ov::AnyMap properties = pyutils::kwargs_to_any_map(kwargs);
if (config.size()) {
PyErr_WarnEx(PyExc_DeprecationWarning,
"'config' parameters is deprecated, please use kwargs to pass config properties instead.",
1);
auto config_properties = pyutils::properties_to_any_map(config);
properties.insert(config_properties.begin(), config_properties.end());
}
return std::make_unique<LLMPipeline>(models_path, device, properties);
}),
py::arg("models_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files",
py::arg("device"), "device on which inference will be done",
py::arg("config") = ov::AnyMap({}), "openvino.properties map",
R"(
LLMPipeline class constructor.
models_path (os.PathLike): Path to the model file.
device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'.
Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline.
kwargs: Device properties.
)")
.def(py::init([](
const std::string& model,
const ov::Tensor& weights,
const ov::genai::Tokenizer& tokenizer,
const std::string& device,
OptionalGenerationConfig generation_config,
const py::kwargs& kwargs
) {
ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
ov::AnyMap properties = pyutils::kwargs_to_any_map(kwargs);
if (!generation_config.has_value()) {
generation_config = ov::genai::GenerationConfig();
}
return std::make_unique<LLMPipeline>(model, weights, tokenizer, device, properties, *generation_config);
}),
py::arg("model"), "string with pre-read model",
py::arg("weights"), "ov::Tensor with pre-read model weights",
py::arg("tokenizer"), "genai Tokenizers",
py::arg("device"), "device on which inference will be done",
py::arg("generation_config") = py::none(), "genai GenerationConfig (default: None, will use empty config)",
R"(
LLMPipeline class constructor.
model (str): Pre-read model.
weights (ov.Tensor): Pre-read model weights.
tokenizer (str): Genai Tokenizers.
device (str): Device to run the model on (e.g., CPU, GPU).
generation_config {ov_genai.GenerationConfig} Genai GenerationConfig. Default is an empty config.
kwargs: Device properties.
)")
.def(
"generate",
[](LLMPipeline& pipe,
const std::variant<ov::Tensor, TokenizedInputs, std::string, std::vector<std::string>>& inputs,
const OptionalGenerationConfig& generation_config,
const pyutils::PyBindStreamerVariant& streamer,
const py::kwargs& kwargs
) -> py::typing::Union<ov::genai::EncodedResults, ov::genai::DecodedResults> {
return call_common_generate(pipe, inputs, generation_config, streamer, kwargs);
},
py::arg("inputs"), "Input string, or list of string or encoded tokens",
py::arg("generation_config") = std::nullopt, "generation_config",
py::arg("streamer") = std::monostate(), "streamer",
(generate_docstring + std::string(" \n ") + generation_config_docstring).c_str()
)
.def(
"__call__",
[](LLMPipeline& pipe,
const std::variant<ov::Tensor, TokenizedInputs, std::string, std::vector<std::string>>& inputs,
const OptionalGenerationConfig& generation_config,
const pyutils::PyBindStreamerVariant& streamer,
const py::kwargs& kwargs
) -> py::typing::Union<ov::genai::EncodedResults, ov::genai::DecodedResults> {
return call_common_generate(pipe, inputs, generation_config, streamer, kwargs);
},
py::arg("inputs"), "Input string, or list of string or encoded tokens",
py::arg("generation_config") = std::nullopt, "generation_config",
py::arg("streamer") = std::monostate(), "streamer",
(generate_docstring + std::string(" \n ") + generation_config_docstring).c_str()
)
.def("get_tokenizer", &LLMPipeline::get_tokenizer)
.def("start_chat", &LLMPipeline::start_chat, py::arg("system_message") = "")
.def("finish_chat", &LLMPipeline::finish_chat)
.def("get_generation_config", &LLMPipeline::get_generation_config, py::return_value_policy::copy)
.def("set_generation_config", &LLMPipeline::set_generation_config, py::arg("config"));
m.def("draft_model", [](
const std::filesystem::path& models_path,
const std::string& device,
const py::kwargs& kwargs
) {
ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
return draft_model(models_path, device, pyutils::kwargs_to_any_map(kwargs)).second;
},
py::arg("models_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files",
py::arg("device") = "", "device on which inference will be performed");
}