|
| 1 | +/* Copyright 2025 The xLLM Authors. All Rights Reserved. |
| 2 | +
|
| 3 | +Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +you may not use this file except in compliance with the License. |
| 5 | +You may obtain a copy of the License at |
| 6 | +
|
| 7 | + https://github.com/jd-opensource/xllm/blob/main/LICENSE |
| 8 | +
|
| 9 | +Unless required by applicable law or agreed to in writing, software |
| 10 | +distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +See the License for the specific language governing permissions and |
| 13 | +limitations under the License. |
| 14 | +==============================================================================*/ |
| 15 | + |
| 16 | +#pragma once |
| 17 | +#include "pipeline_flux_base.h" |
| 18 | +#include "processors/siglip_image_processor.h" |
| 19 | +#include "siglip_vision_model.h" |
| 20 | +// pipeline_flux_prior_redux compatible with huggingface weights |
| 21 | +// ref to: |
| 22 | +// https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/flux/pipeline_flux_prior_redux.py |
| 23 | + |
| 24 | +namespace xllm { |
| 25 | + |
| 26 | +class ReduxImageEncoderImpl : public torch::nn::Module { |
| 27 | + public: |
| 28 | + explicit ReduxImageEncoderImpl(const ModelContext& context) { |
| 29 | + auto model_args = context.get_model_args(); |
| 30 | + auto options = context.get_tensor_options(); |
| 31 | + act_ = register_module("act", torch::nn::Functional(torch::silu)); |
| 32 | + |
| 33 | + redux_up_ = register_module("redux_up", |
| 34 | + DiTLinear(model_args.mm_hidden_size(), |
| 35 | + model_args.mm_intermediate_size() * 3, |
| 36 | + true)); |
| 37 | + redux_down_ = |
| 38 | + register_module("redux_down", |
| 39 | + DiTLinear(model_args.mm_intermediate_size() * 3, |
| 40 | + model_args.mm_intermediate_size(), |
| 41 | + true)); |
| 42 | + redux_up_->to(options); |
| 43 | + redux_down_->to(options); |
| 44 | + } |
| 45 | + |
| 46 | + torch::Tensor forward(const torch::Tensor& hidden_states) { |
| 47 | + return redux_down_(act_(redux_up_(hidden_states))); |
| 48 | + } |
| 49 | + |
| 50 | + void load_model(std::unique_ptr<DiTFolderLoader> loader) { |
| 51 | + for (const auto& state_dict : loader->get_state_dicts()) { |
| 52 | + redux_up_->load_state_dict(state_dict->get_dict_with_prefix("redux_up.")); |
| 53 | + redux_up_weight_loaded_ = true; |
| 54 | + redux_up_bias_loaded_ = true; |
| 55 | + redux_down_->load_state_dict( |
| 56 | + state_dict->get_dict_with_prefix("redux_down.")); |
| 57 | + redux_down_weight_loaded_ = true; |
| 58 | + redux_down_bias_loaded_ = true; |
| 59 | + } |
| 60 | + } |
| 61 | + |
| 62 | + void verify_loaded_weights(const std::string& prefix) const { |
| 63 | + CHECK(redux_up_weight_loaded_) |
| 64 | + << "weight is not loaded for " << prefix + "redux_up.weight"; |
| 65 | + CHECK(redux_up_bias_loaded_) |
| 66 | + << "weight is not loaded for " << prefix + "redux_up.bias"; |
| 67 | + CHECK(redux_down_weight_loaded_) |
| 68 | + << "weight is not loaded for " << prefix + "redux_down.weight"; |
| 69 | + CHECK(redux_down_bias_loaded_) |
| 70 | + << "weight is not loaded for " << prefix + "redux_down.bias"; |
| 71 | + } |
| 72 | + |
| 73 | + private: |
| 74 | + DiTLinear redux_up_{nullptr}; |
| 75 | + DiTLinear redux_down_{nullptr}; |
| 76 | + |
| 77 | + torch::nn::Functional act_ = nullptr; |
| 78 | + bool redux_up_weight_loaded_ = false; |
| 79 | + bool redux_up_bias_loaded_ = false; |
| 80 | + bool redux_down_weight_loaded_ = false; |
| 81 | + bool redux_down_bias_loaded_ = false; |
| 82 | +}; |
| 83 | +TORCH_MODULE(ReduxImageEncoder); |
| 84 | + |
| 85 | +REGISTER_MODEL_ARGS(ReduxImageEncoder, [&] { |
| 86 | + LOAD_ARG_OR(dtype, "torch_dtype", "bfloat16"); |
| 87 | + LOAD_ARG_OR(mm_hidden_size, "redux_dim", 1152); |
| 88 | + LOAD_ARG_OR(mm_intermediate_size, "txt_in_features", 4096); |
| 89 | +}); |
| 90 | + |
| 91 | +class FluxPriorReduxPipelineImpl : public FluxPipelineBaseImpl { |
| 92 | + public: |
| 93 | + FluxPriorReduxPipelineImpl(const DiTModelContext& context) { |
| 94 | + auto model_args = context.get_model_args("feature_extractor"); |
| 95 | + options_ = context.get_tensor_options(); |
| 96 | + image_encoder_ = |
| 97 | + SiglipVisionModel(context.get_model_context("image_encoder")); |
| 98 | + image_embedder_ = |
| 99 | + ReduxImageEncoder(context.get_model_context("image_embedder")); |
| 100 | + feature_extractor_ = std::make_unique<SiglipImageProcessor>(model_args); |
| 101 | + } |
| 102 | + |
| 103 | + void load_model(std::unique_ptr<DiTModelLoader> loader) { |
| 104 | + std::string model_path = loader->model_root_path(); |
| 105 | + auto image_encoder_loader = loader->take_component_loader("image_encoder"); |
| 106 | + auto image_embedder_loader = |
| 107 | + loader->take_component_loader("image_embedder"); |
| 108 | + image_encoder_->load_model(std::move(image_encoder_loader)); |
| 109 | + image_encoder_->to(options_.device()); |
| 110 | + image_embedder_->load_model(std::move(image_embedder_loader)); |
| 111 | + image_embedder_->to(options_.device()); |
| 112 | + } |
| 113 | + |
| 114 | + torch::Tensor encode_image(const torch::Tensor& image, |
| 115 | + int64_t num_images_per_prompt) { |
| 116 | + auto imgs = feature_extractor_->preprocess(image).to(options_); |
| 117 | + auto image_enc_hidden_states = image_encoder_->forward(imgs); |
| 118 | + image_enc_hidden_states = |
| 119 | + image_enc_hidden_states.repeat_interleave(num_images_per_prompt, 0); |
| 120 | + return image_enc_hidden_states; |
| 121 | + } |
| 122 | + |
| 123 | + DiTForwardOutput forward(const DiTForwardInput& input) { |
| 124 | + const auto& generation_params = input.generation_params; |
| 125 | + auto image = input.images.defined() ? std::make_optional(input.images) |
| 126 | + : std::nullopt; |
| 127 | + auto prompt_embeds = input.prompt_embeds.defined() |
| 128 | + ? std::make_optional(input.prompt_embeds) |
| 129 | + : std::nullopt; |
| 130 | + auto pooled_prompt_embeds = |
| 131 | + input.pooled_prompt_embeds.defined() |
| 132 | + ? std::make_optional(input.pooled_prompt_embeds) |
| 133 | + : std::nullopt; |
| 134 | + auto prompt_embeds_scale = generation_params.prompt_embeds_scale; |
| 135 | + auto pooled_prompt_embeds_scale = |
| 136 | + generation_params.pooled_prompt_embeds_scale; |
| 137 | + std::vector<torch::Tensor> output = forward_(image.value(), |
| 138 | + prompt_embeds, |
| 139 | + pooled_prompt_embeds, |
| 140 | + generation_params.height, |
| 141 | + generation_params.width, |
| 142 | + prompt_embeds_scale, |
| 143 | + pooled_prompt_embeds_scale); |
| 144 | + DiTForwardOutput out; |
| 145 | + out.tensors = output; |
| 146 | + return out; |
| 147 | + } |
| 148 | + |
| 149 | + std::vector<torch::Tensor> forward_( |
| 150 | + torch::Tensor image, |
| 151 | + std::optional<torch::Tensor> prompt_embeds_opt, |
| 152 | + std::optional<torch::Tensor> pooled_prompt_embeds_opt, |
| 153 | + int64_t height = 384, |
| 154 | + int64_t width = 384, |
| 155 | + float prompt_embeds_scale = 1.0f, |
| 156 | + float pooled_prompt_embeds_scale = 1.0f) { |
| 157 | + torch::NoGradGuard no_grad; |
| 158 | + int64_t batch_size = image.dim() == 4 ? image.size(0) : 1; |
| 159 | + torch::Tensor image_latents = |
| 160 | + encode_image(image, /*num_images_per_prompt=*/1); |
| 161 | + torch::Tensor image_embeds = |
| 162 | + image_embedder_->forward(image_latents).to(options_); |
| 163 | + |
| 164 | + // prompt_embeds: [batch_size, seq_len, hidden_dim] |
| 165 | + torch::Tensor prompt_embeds = prompt_embeds_opt.value_or( |
| 166 | + torch::zeros({batch_size, 512, 4096}, options_)); |
| 167 | + // pooled_prompt_embeds: [batch_size, pooled_hidden_dim] |
| 168 | + torch::Tensor pooled_prompt_embeds = pooled_prompt_embeds_opt.value_or( |
| 169 | + torch::zeros({batch_size, 768}, options_)); |
| 170 | + |
| 171 | + prompt_embeds = torch::cat({prompt_embeds, image_embeds}, /*dim=*/1); |
| 172 | + prompt_embeds *= torch::full({batch_size}, prompt_embeds_scale, options_) |
| 173 | + .view({-1, 1, 1}); |
| 174 | + pooled_prompt_embeds *= |
| 175 | + torch::full({batch_size}, pooled_prompt_embeds_scale, options_) |
| 176 | + .view({-1, 1}); |
| 177 | + |
| 178 | + prompt_embeds = torch::sum(prompt_embeds, /*dim=*/0, /*keepdim=*/true); |
| 179 | + pooled_prompt_embeds = |
| 180 | + torch::sum(pooled_prompt_embeds, /*dim=*/0, /*keepdim=*/true); |
| 181 | + |
| 182 | + return {prompt_embeds, pooled_prompt_embeds}; |
| 183 | + } |
| 184 | + |
| 185 | + private: |
| 186 | + SiglipVisionModel image_encoder_{nullptr}; |
| 187 | + std::unique_ptr<SiglipImageProcessor> feature_extractor_; |
| 188 | + ReduxImageEncoder image_embedder_{nullptr}; |
| 189 | +}; |
| 190 | +TORCH_MODULE(FluxPriorReduxPipeline); |
| 191 | + |
| 192 | +REGISTER_DIT_MODEL(fluxredux, FluxPriorReduxPipeline); |
| 193 | +} // namespace xllm |
0 commit comments