Skip to content

Commit b15ef77

Browse files
goyaladitya05goyal and goyal authored
Added GGUF format support for LoRA adapters (openvinotoolkit#3204)
## Description This PR implements support for LoRA adapters in GGUF format, extending the existing safetensors-only support as mentioned in the project roadmap. ## Changes - Added `GGUFAdapterImpl` class to handle GGUF adapter loading - Implemented `convert_gguf_name_to_hf()` function to map GGUF tensor naming conventions to HuggingFace/OpenVINO conventions - Modified `Adapter` constructor to detect `.gguf` extension and route to appropriate implementation - Added `ENABLE_GGUF` preprocessor guards for conditional compilation ## Implementation Details The implementation: - Reuses existing `get_gguf_data()` utilities from the GGUF infrastructure - Follows the same design pattern as `SafetensorsAdapterImpl` for consistency - Supports standard GGUF tensor naming conventions (`blk.N.`, `attn_q`, `ffn_gate`, etc.) - Automatically maps GGUF names to HuggingFace/OpenVINO conventions ## Testing **Test Setup:** - Model: TinyLlama-1.1B-Chat-v1.0 (OpenVINO format) - Adapter: tinyllama-function-call-lora-adapter-250424-f16.gguf **Results:** Adapter object created successfully Pipeline initialized with GGUF adapter Text generation completed successfully Output quality verified ## Related Implemented support of GGUF LoRA Closes openvinotoolkit#2323 cc @rkazants --------- Co-authored-by: goyal <goyaladitya070@gmail.com>
1 parent d207d0b commit b15ef77

File tree

2 files changed

+506
-2
lines changed

2 files changed

+506
-2
lines changed

src/cpp/src/lora/adapter.cpp

Lines changed: 127 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,14 @@
4343
#include "lora/common.hpp"
4444
#include "lora/names_mapping.hpp"
4545

46+
#include <algorithm>
#include <cctype>

#ifdef ENABLE_GGUF
#include <utility>
#include <vector>
#include "gguf_utils/gguf.hpp"
#endif
53+
4654
extern "C" {
4755
#include "safetensors.h"
4856
}
@@ -1063,6 +1071,115 @@ class SafetensorsAdapterImpl : public AdapterImpl {
10631071
};
10641072

10651073

1074+
#ifdef ENABLE_GGUF
1075+
// Helper to convert GGUF tensor names to OpenVINO/HF names.
// Examples: "blk.0.attn_q.weight" -> "model.layers.0.self_attn.q_proj.weight",
//           "token_embd.weight"   -> "model.embed_tokens.weight".
// Fragments that match no known GGUF component are left untouched.
std::string convert_gguf_name_to_hf(const std::string& name) {
    // 1. Handle blocks: blk.N. -> model.layers.N.
    std::string new_name = name;
    size_t pos = 0;
    while ((pos = new_name.find("blk.", pos)) != std::string::npos) {
        size_t num_start = pos + 4; // after "blk."
        size_t num_end = new_name.find('.', num_start);
        if (num_end == std::string::npos) {
            break;  // no closing '.' after the layer index -- nothing more to rewrite
        }
        std::string layer_num = new_name.substr(num_start, num_end - num_start);
        // Verify it's actually a number before rewriting
        bool is_number = !layer_num.empty() &&
                         std::all_of(layer_num.begin(), layer_num.end(),
                                     [](unsigned char c){ return std::isdigit(c); });
        if (is_number) {
            std::string replacement = "model.layers." + layer_num + ".";
            new_name.replace(pos, num_end - pos + 1, replacement);
            pos += replacement.length();
        } else {
            pos = num_end;
        }
    }

    // 2. Handle specific layer components.
    // NOTE: pairs are applied in list order with plain substring search, so a
    // key must precede every key that is a substring of it. Previously
    // "attn_q"/"attn_k" were listed before "attn_q_norm"/"attn_k_norm", so the
    // Qwen norm tensors were mangled into "self_attn.q_proj_norm" and the
    // *_norm mappings never fired; keep most-specific-first ordering.
    static const std::vector<std::pair<std::string, std::string>> replacements = {
        // Qwen-specific attention normalization mappings (must precede attn_q/attn_k)
        {"attn_q_norm", "self_attn.q_norm"},
        {"attn_k_norm", "self_attn.k_norm"},
        // Attention and Norms
        {"attn_norm", "input_layernorm"},
        {"ffn_norm", "post_attention_layernorm"},
        {"attn_q", "self_attn.q_proj"},
        {"attn_k", "self_attn.k_proj"},
        {"attn_v", "self_attn.v_proj"},
        {"attn_output", "self_attn.o_proj"},  // must precede "output"
        {"ffn_gate", "mlp.gate_proj"},
        {"ffn_up", "mlp.up_proj"},
        {"ffn_down", "mlp.down_proj"},
        // Global components ("output_norm" must precede "output")
        {"token_embd", "model.embed_tokens"},
        {"output_norm", "model.norm"},
        {"output", "lm_head"}
    };

    for (const auto& [gguf_part, hf_part] : replacements) {
        if (gguf_part.empty()) continue; // avoid infinite loop
        size_t part_pos = 0;
        while ((part_pos = new_name.find(gguf_part, part_pos)) != std::string::npos) {
            new_name.replace(part_pos, gguf_part.length(), hf_part);
            part_pos += hf_part.length(); // continue after replaced part
        }
    }

    return new_name;
}
1133+
1134+
class GGUFAdapterImpl : public AdapterImpl {
1135+
public:
1136+
1137+
GGUFAdapterImpl(const std::filesystem::path& path) {
1138+
// Use get_gguf_data to load raw tensors without enforcing a full model structure
1139+
// This is crucial for adapters which only contain sparse weights
1140+
auto gguf_data = get_gguf_data(path.string());
1141+
auto& raw_tensors = std::get<1>(gguf_data);
1142+
1143+
ConstantMap constant_map;
1144+
for (auto& [name, tensor] : raw_tensors) {
1145+
// Convert GGUF naming convention to Hugging Face / OpenVINO convention
1146+
std::string converted_name = convert_gguf_name_to_hf(name);
1147+
1148+
auto constant = std::make_shared<v0::Constant>(tensor.get_element_type(), tensor.get_shape(), tensor.data());
1149+
constant->get_rt_info()["__gguf_buffer_holder"] = tensor;
1150+
constant_map[converted_name] = constant;
1151+
}
1152+
1153+
constant_tensors = group_lora_constant_tensors(constant_map, default_lora_constant_patterns());
1154+
for (const auto& constant_tensor : constant_tensors) {
1155+
constant_map.erase(constant_tensor.first);
1156+
}
1157+
tensors = group_lora_tensors(constant_map, default_lora_patterns());
1158+
}
1159+
1160+
const LoRATensors& get_tensors() const override {
1161+
return tensors;
1162+
}
1163+
1164+
const LoRAConstantTensors& get_constant_tensors() const override {
1165+
return constant_tensors;
1166+
}
1167+
1168+
bool eq(const AdapterImpl* other) const override {
1169+
if(auto other_casted = dynamic_cast<const GGUFAdapterImpl*>(other)) {
1170+
return other_casted == this;
1171+
}
1172+
return false;
1173+
}
1174+
1175+
private:
1176+
1177+
LoRATensors tensors;
1178+
LoRAConstantTensors constant_tensors;
1179+
};
1180+
#endif
1181+
1182+
10661183
/// @brief Adapter that derived from another adapter by applying Derivation function.
10671184
/// Two objects instantiated from the same Derivation type are equal when both origins and derivations are equal (while comparing with operator==).
10681185
/// The derivation is postponed to the first call of get_tensors(), giving a way to compare Adapters without applying the derivation.
@@ -1122,8 +1239,16 @@ Adapter flux_adapter_normalization(const Adapter& adapter) {
11221239
Adapter::Adapter(const std::shared_ptr<AdapterImpl>& pimpl) : m_pimpl(pimpl) {}
11231240

11241241

1125-
Adapter::Adapter(const std::filesystem::path& path) :
1126-
m_pimpl(std::make_shared<SafetensorsAdapterImpl>(path)) {
1242+
// Construct an adapter from a file path, routing by extension:
// ".gguf" -> GGUFAdapterImpl (when built with ENABLE_GGUF), anything else ->
// SafetensorsAdapterImpl.
Adapter::Adapter(const std::filesystem::path& path) {
    // Compare the extension case-insensitively so ".GGUF"/".Gguf" (typical on
    // case-insensitive filesystems) are not silently fed to the safetensors
    // parser.
    std::string ext = path.extension().string();
    std::transform(ext.begin(), ext.end(), ext.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    if (ext == ".gguf") {
#ifdef ENABLE_GGUF
        m_pimpl = std::make_shared<GGUFAdapterImpl>(path);
#else
        OPENVINO_THROW("GGUF support is disabled. Please build with ENABLE_GGUF=ON to use GGUF adapters.");
#endif
    } else {
        m_pimpl = std::make_shared<SafetensorsAdapterImpl>(path);
    }
}
11281253

11291254

0 commit comments

Comments
 (0)