-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathLLM_runtime.h
More file actions
284 lines (231 loc) · 13.6 KB
/
LLM_runtime.h
File metadata and controls
284 lines (231 loc) · 13.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
/// @file LLM_runtime.h
/// @brief Runtime loading and management of LLM libraries
/// @ingroup llm
/// @details Provides dynamic library loading capabilities for LLM backends,
/// architecture detection, and cross-platform library management
#pragma once
#include <fstream>
#include <sstream>
#include <vector>
#include <iostream>
#include <setjmp.h>
#include <type_traits>
#include <algorithm>
#include <cstdlib>
#include "defs.h"
#include "error_handling.h"
#include "LLM.h"
#if defined(_WIN32) || defined(__linux__)
#include "archchecker.h"
#endif
// Platform-specific library loading definitions
#if defined(_WIN32)
#include <windows.h>
#include <libloaderapi.h>
using LibHandle = HMODULE; ///< Windows library handle type
#define LOAD_LIB(path) LoadLibraryA(path) ///< Load library macro for Windows
#define GET_SYM(handle, name) GetProcAddress(handle, name) ///< Get symbol macro for Windows
#define CLOSE_LIB(handle) FreeLibrary(handle) ///< Close library macro for Windows
#else
#include <dlfcn.h>
#include <unistd.h>
#include <limits.h>
#ifdef __APPLE__
#include <mach-o/dyld.h>
#endif
using LibHandle = void *; ///< Unix library handle type
#define LOAD_LIB(path) dlopen(path, RTLD_LAZY) ///< Load library macro for Unix
#define GET_SYM(handle, name) dlsym(handle, name) ///< Get symbol macro for Unix
#define CLOSE_LIB(handle) dlclose(handle) ///< Close library macro for Unix
#endif
//=================================== FUNCTION LISTS ===================================//
/// @brief Macro defining the list of dynamically loaded LLM functions
/// @param M Macro to apply to each function signature, invoked as M(name, return_type, arg_types...)
/// @details This macro is used to generate function pointer declarations and loading code
/// (see the DECLARE_FN expansion inside LLMService). Each entry names an extern "C" symbol
/// expected to be exported by the backend library; keep names and signatures in sync with
/// the backend implementation.
// NOTE(review): the argument list of LLMService_Construct presumably mirrors the
// LLMService parameterized constructor's parameters — confirm against the backend export.
#define LLM_FUNCTIONS_LIST(M) \
M(LLMService_Registry, void, LLMProviderRegistry *) \
M(LLMService_InjectErrorState, void, ErrorState *) \
M(LLMService_Supports_GPU, bool) \
M(LLMService_Construct, LLMProvider *, const char *, int, int, int, bool, int, int, bool, int, const char **) \
M(LLMService_From_Command, LLMProvider *, const char *)
/// @brief Runtime loader for LLM libraries
/// @details This class provides dynamic loading of LLM backend libraries,
/// allowing for flexible deployment and architecture-specific optimizations
/// @brief Runtime loader for LLM libraries
/// @details This class provides dynamic loading of LLM backend libraries,
/// allowing for flexible deployment and architecture-specific optimizations.
/// All LLMProvider overrides delegate to the dynamically loaded backend instance
/// stored in @ref llm.
/// @note The delegating methods assume a backend has been loaded successfully
/// (i.e. @ref llm is non-null); calling them on an uninitialized runtime is
/// undefined behavior.
class UNDREAMAI_API LLMService : public LLMProvider
{
public:
    /// @brief Default constructor
    /// @details Creates an uninitialized runtime that must load a library before use
    LLMService();

    /// @brief Parameterized constructor
    /// @param model_path Path to the model file
    /// @param num_slots Number of parallel slots
    /// @param num_threads Number of CPU threads (-1 for auto-detection)
    /// @param num_GPU_layers Number of layers to offload to GPU
    /// @param flash_attention Whether to enable flash attention optimization
    /// @param context_size Maximum context length in tokens
    /// @param batch_size Processing batch size
    /// @param embedding_only Whether to run in embedding-only mode
    /// @param lora_paths Vector of paths to LoRA adapter files
    /// @details Creates and initializes a runtime with the specified parameters
    LLMService(const std::string &model_path, int num_slots = 1, int num_threads = -1, int num_GPU_layers = 0, bool flash_attention = false, int context_size = 4096, int batch_size = 2048, bool embedding_only = false, const std::vector<std::string> &lora_paths = {});

    /// @brief Destructor
    ~LLMService();

    /// @brief Create runtime from command line string
    /// @param command Command line argument string
    /// @return Pointer to newly created LLMService instance
    /// @details Factory method for creating runtime instances from command arguments.
    /// See https://github.com/ggml-org/llama.cpp/tree/master/tools/server#usage for arguments.
    static LLMService *from_command(const std::string &command);

    /// @brief Create runtime from argc/argv
    /// @param argc Argument count
    /// @param argv Argument vector
    /// @return Pointer to newly created LLMService instance
    /// @details Factory method for creating runtime instances from main() parameters
    static LLMService *from_command(int argc, char **argv);

    LibHandle handle = nullptr;  ///< Handle to loaded library
    LLMProvider *llm = nullptr;  ///< Pointer to loaded LLM provider instance

    /// @brief Loads LLM library dynamically according to underlying architecture and creates a LLM based on the command
    /// @param command Command string containing model path and parameters
    /// @return true if library loaded successfully, false otherwise
    bool create_LLM_library(const std::string &command);

    //=================================== LLM METHODS START ===================================//
    /// @brief Tokenize input (override - delegates to loaded library)
    /// @param data JSON object containing text to tokenize
    /// @return JSON string with token data
    std::string tokenize_json(const json &data) override { return llm->tokenize_json(data); }
    /// @brief Convert tokens back to text (override - delegates to loaded library)
    /// @param data JSON object containing token IDs
    /// @return JSON string containing detokenized text
    std::string detokenize_json(const json &data) override { return llm->detokenize_json(data); }
    /// @brief Generate embeddings (override - delegates to loaded library)
    /// @param data JSON object containing embedding request
    /// @return JSON string with embedding data
    std::string embeddings_json(const json &data) override { return llm->embeddings_json(data); }
    /// @brief Generate completion (override - delegates to loaded library)
    /// @param data JSON completion request
    /// @param callback Optional streaming callback
    /// @param callbackWithJSON Whether callback uses JSON
    /// @return Generated completion
    std::string completion_json(const json &data, CharArrayFn callback = nullptr, bool callbackWithJSON = true) override { return llm->completion_json(data, callback, callbackWithJSON); }
    /// @brief Apply a chat template to message data (override - delegates to loaded library)
    /// @param data JSON object containing messages to format
    /// @return Formatted string with template applied
    std::string apply_template_json(const json &data) override { return llm->apply_template_json(data); }
    /// @brief Cancel request (override - delegates to loaded library)
    /// @param id_slot Slot ID of the request to cancel
    void cancel(int id_slot) override { llm->cancel(id_slot); }
    /// @brief Configure LoRA weights (override - delegates to loaded library)
    /// @param data JSON object with LoRA configuration
    /// @return JSON response string
    std::string lora_weight_json(const json &data) override { return llm->lora_weight_json(data); }
    /// @brief List available LoRA adapters (override - delegates to loaded library)
    /// @return JSON string containing list of available LoRA adapters
    std::string lora_list_json() override { return llm->lora_list_json(); }
    /// @brief Manage slots (override - delegates to loaded library)
    /// @param data JSON object with slot operation
    /// @return JSON response string
    std::string slot_json(const json &data) override { return llm->slot_json(data); }
    /// @brief Start HTTP server (override - delegates to loaded library)
    /// @param host Host address (default: "0.0.0.0")
    /// @param port Port number (defaults to -1; NOTE(review): earlier doc said "0 for auto" — confirm which sentinel selects an automatic port)
    /// @param API_key Optional API key
    void start_server(const std::string &host = "0.0.0.0", int port = -1, const std::string &API_key = "") override { llm->start_server(host, port, API_key); }
    /// @brief Stop HTTP server (override - delegates to loaded library)
    void stop_server() override { llm->stop_server(); }
    /// @brief Start service (override - delegates to loaded library)
    void start() override { llm->start(); }
    /// @brief Check service status (override - delegates to loaded library)
    /// @return true if started, false otherwise
    bool started() override { return llm->started(); }
    /// @brief Stop service (override - delegates to loaded library)
    void stop() override { llm->stop(); }
    /// @brief Wait for service completion (override - delegates to loaded library)
    void join_service() override { llm->join_service(); }
    /// @brief Wait for server completion (override - delegates to loaded library)
    void join_server() override { llm->join_server(); }
    /// @brief Set SSL configuration (override - delegates to loaded library)
    /// @param cert SSL certificate path
    /// @param key SSL private key path
    void set_SSL(const std::string &cert, const std::string &key) override { llm->set_SSL(cert, key); }
    /// @brief Get embedding size (override - delegates to loaded library)
    /// @return Number of embedding dimensions
    int embedding_size() override { return llm->embedding_size(); }
    /// @brief Get available slot (override - delegates to loaded library)
    /// @return Available slot ID
    int get_next_available_slot() override { return llm->get_next_available_slot(); }
    /// @brief Get slot context size (override - delegates to loaded library)
    /// @return Slot context size
    int get_slot_context_size() override { return llm->get_slot_context_size(); }
    /// @brief Set debug level (override - delegates to loaded library)
    /// @param debug_level Debug verbosity level
    void debug(int debug_level) override { llm->debug(debug_level); }
    /// @brief Set logging callback (override - delegates to loaded library)
    /// @param callback Function to receive log messages
    void logging_callback(CharArrayFn callback) override { llm->logging_callback(callback); }
    /// @brief Identify this implementation for debugging purposes
    /// @return The literal string "runtime_detection"
    std::string debug_implementation() override { return "runtime_detection"; }
    //=================================== LLM METHODS END ===================================//

    /// @brief Declare function pointers for dynamically loaded functions
    /// @details Uses the LLM_FUNCTIONS_LIST macro to declare all required function pointers,
    /// each initialized to nullptr until the backend library is loaded
#define DECLARE_FN(name, ret, ...) \
    ret (*name)(__VA_ARGS__) = nullptr;
    LLM_FUNCTIONS_LIST(DECLARE_FN)
#undef DECLARE_FN

protected:
    std::vector<std::string> search_paths;  ///< Library search paths

    /// @brief Load LLM library backend
    /// @param command Command string with parameters
    /// @param llm_lib_filename Specific library filename to load
    /// @param is_gpu_library Whether the library is a GPU-enabled backend
    /// @return true if library loaded successfully, false otherwise
    /// @details Internal method for loading specific library files
    bool create_LLM_library_backend(const std::string &command, const std::string &llm_lib_filename, bool is_gpu_library = false);
};
/// @brief Get OS-specific library directory
/// @return Path to platform-specific library directory
/// @details Returns the appropriate library directory for the current operating system
const std::string os_library_dir();
/// @brief Get available architectures for the platform
/// @param gpu Whether to include GPU-enabled architectures
/// @return Vector of available architecture strings
/// @details Detects available CPU/GPU architectures for library selection
const std::vector<std::string> available_architectures(bool gpu);
// NOTE(review): the helpers below are declared `static` in a header, so every
// translation unit that includes this file gets its own internal-linkage
// declaration — confirm they are defined/used only where intended, or move
// the declarations into the implementation file.
/// @brief Get directory containing the current executable
/// @return Path to executable directory
/// @details Helper function for locating libraries relative to executable
static std::string get_executable_directory();
/// @brief Get current working directory
/// @return Path to current directory
/// @details Helper function for relative path resolution
static std::string get_current_directory();
/// @brief Get library paths from environment variables
/// @param env_vars Vector of environment variable names to check
/// @return Vector of paths found in environment variables
/// @details Extracts library search paths from specified environment variables
static std::vector<std::string> get_env_library_paths(const std::vector<std::string> &env_vars);
/// @brief Get standard library search directories
/// @return Vector of standard search directory paths
/// @details Returns platform-specific standard directories for library searches
static std::vector<std::string> get_search_directories();
/// @brief Get default environment variables for library paths
/// @return Vector of environment variable names to check for library paths
/// @details Returns platform-specific environment variables used for library loading
std::vector<std::string> get_default_library_env_vars();
//=================================== EXTERNAL API ===================================//
/// @ingroup c_api
/// @{
extern "C"
{
    /// @brief Get available architectures (C API)
    /// @param gpu Whether to include GPU architectures
    /// @return JSON string containing available architectures
    // NOTE(review): ownership/lifetime of the returned buffer is not visible here —
    // confirm whether the caller must free it or it points to static storage.
    UNDREAMAI_API const char *Available_Architectures(bool gpu);
}
/// @}